#
# 13 Oct 19
#
# C source data from "The New C Standard"
# Mathematics data from An Analysis of Mathematical Expressions Used in Practice
#
# Compares how the percentage of expressions falls off as the number of
# operators they contain increases, for mathematics and for C source, and
# fits a straight line to log(percentage) against operator count for each.
#
# ---------------------------------------------------------------------------
# Raw data notes (kept for reference; not read by the code below).
# Rows that show no operator symbol are reproduced exactly as found in
# this file.
#
# Op Freq Percent
# =  128715 0.39398984
# -  116064 0.35526580
# ,  112818 0.34532996
#    103090 0.31555307
# +  79404 0.24305147
#    43942 0.13450415
# *  29210 0.08941027
#    23818 0.07290564
#    23405 0.07164147
# <= 20088 0.06148831
#    14242 0.04359401
#    16875 0.05165349
#    13560 0.04150645
# >  13528 0.04140850
#    13138 0.04021473
#    12451 0.03811186
# <  12058 0.03690890
#    12005 0.03674667
#    11940 0.03654771
#    11294 0.03457034
# Total=32669624
#
# Punctuator % of Tokens   Punctuator % of Tokens   Punctuator % of Tokens
# ,  8.82                  == 0.53                  || 0.16
# )  8.09                  :  0.46                  += 0.11
# (  8.09                  -v 0.40                  >  0.11
# ;  7.80                  *p 0.40                  << 0.09
# =  3.08                  +  0.38                  ?: 0.08
# -> 3.00                  *v 0.34                  ?  0.08
# }  1.87                  &  0.32                  |= 0.08
# {  1.87                  !  0.31                  >= 0.07
# .  1.26                  v++ 0.27                 /  0.06
# *  1.10                  && 0.26                  >> 0.06
# #  1.00                  != 0.26                  ~  0.05
# ]  0.96                  <  0.22                  v-- 0.04
# [  0.96                  -  0.19                  &= 0.04
# &v 0.58                  |  0.17                  <= 0.04
#
# + 0.182 0.233
# , 1.565 1.914
# - 1.176 0.831
#
# n 48,150
# i 43,280
# x 36,240
# k 32,060
# t 25,967
# X 23,369
# j 23,038
# p 22,832
# A 22,791
#
# 124984 n-1
# 104930 1/2
# 88735 n+1
# 69340 i =1
# 50466 k-1
# 48712 i+1
# 48539 k+1
# 7972 2n+1
# 6718 2n-1
# 5515 2k+1
# 4197 2k-1
# 2788 2m+1
# 2770 2i-1
#
# subexpressions maths
#  2 16892594
#  3 7063566
#  4 3329619
#  5 2174033
#  6 1084395
#  7 880405
#  8 508236
#  9 417702
# 10 270138
# 11 239619
# 12 163657
# 13 148238
# 14 110081
# 15 103399
# Total=34338150
#
# Operators in expressions in C
#  0 1194211
#  1 637206
#  2 122388
#  3 59027
#  4 18425
#  5 12528
#  6 4178
#  7 3954
#  8 1585
#  9 1435
# 10 626
# 11 825
# 12 300
# 13 336
# 14 204
# 15 281
# 16 128
# 17 140
# 18 52
# 19 85
# 20 50
# ---------------------------------------------------------------------------

# Mathematics: counts taken from the "subexpressions maths" table above,
# first entry corresponds to 2, last to 15.
se=c(16892594, 7063566, 3329619, 2174033, 1084395, 880405, 508236,
     417702, 270138, 239619, 163657, 148238, 110081, 103399)
# Total as given in the table; it is larger than sum(se), so the
# percentages below do not add up to 100.
total_se=34338150

# 100*se/total_se

# ops runs 2, 3, ... to line up with the first table entry being 2.
m_op_expr=data.frame(expr=100*se/total_se, ops=1+seq_along(se))

# C source: expression counts for 0, 1, 2, ... 20 operators.
c_eop=c(1194211, 637206, 122388, 59027, 18425, 12528, 4178, 3954, 1585,
        1435, 626, 825, 300, 336, 204, 281, 128, 140, 52, 85, 50)
# total_c_eop=2057964
# 100*c_eop/total_c_eop

# Drop the 0- and 1-operator counts so the C data starts at 2 operators,
# like the mathematics data, and renormalise over what remains.
c_2eop=c_eop[-(1:2)]
total_c_2eop=sum(c_2eop)

c_2op_expr=data.frame(expr=100*c_2eop/total_c_2eop, ops=1+seq_along(c_2eop))

# Axis ranges are derived from the mathematics data only.
# NOTE(review): C points beyond that range (ops above 15, or very small
# percentages) will not be drawn -- confirm this is intended.
plot(m_op_expr$ops, m_op_expr$expr, log="y", col="red", las=1,
	xlab="Operators", ylab="Expressions (percentage)")

points(c_2op_expr$ops, c_2op_expr$expr, col="blue")

legend(x="topright", legend=c("Mathematics", "C source"), bty="n",
	fill=c("red", "blue"), cex=1.2)

# Straight-line fit to log(percentage); glm's default family is gaussian.
m_mod=glm(log(expr) ~ ops, data=m_op_expr)
summary(m_mod)

# Predictions are on the log scale, so exponentiate before drawing.
pred=predict(m_mod)
lines(m_op_expr$ops, exp(pred), col="red")

# subset the data to prevent rare uses skewing the fit
c_mod=glm(log(expr) ~ ops, data=c_2op_expr, subset=1:10)
summary(c_mod)

# predict() returns one value per fitted row, i.e. the first 10 only.
pred=predict(c_mod)
lines(c_2op_expr$ops[1:10], exp(pred), col="blue")