#
# 13 Oct 19
#
# C source data from "The New C Standard"
# Mathematics data from "An Analysis of Mathematical Expressions Used in Practice"

Op  Freq   Percent
=   128715 0.39398984
-   116064 0.35526580
,   112818 0.34532996
    103090 0.31555307
+   79404  0.24305147
    43942  0.13450415
*   29210  0.08941027
    23818  0.07290564
    23405  0.07164147
<=  20088  0.06148831
    14242  0.04359401
    16875  0.05165349
    13560  0.04150645
>   13528  0.04140850
    13138  0.04021473
    12451  0.03811186
<   12058  0.03690890
    12005  0.03674667
    11940  0.03654771
    11294  0.03457034

Total=32669624

Punctuator % of Tokens Punctuator % of Tokens Punctuator % of Tokens
, 8.82 == 0.53 || 0.16
) 8.09 : 0.46 += 0.11
( 8.09 -v 0.40 > 0.11
; 7.80 *p 0.40 << 0.09
= 3.08 + 0.38 ?: 0.08
-> 3.00 *v 0.34 ?  0.08
} 1.87 & 0.32 |= 0.08
{ 1.87 !  0.31 >= 0.07
.  1.26 v++ 0.27 / 0.06
* 1.10 && 0.26 >> 0.06
# 1.00 != 0.26 ~ 0.05
] 0.96 < 0.22 v-- 0.04
[ 0.96 - 0.19 &= 0.04
&v 0.58 | 0.17 <= 0.04

 + 0.182 0.233
 , 1.565 1.914
 - 1.176 0.831
 

 n 48,150
 i 43,280
 x 36,240
 k 32,060
 t 25,967
 X 23,369
 j 23,038
 p 22,832
 A 22,791

124984 n-1
104930 1/2
 88735 n+1
 69340 i =1
 50466 k-1
 48712 i+1
 48539 k+1
 

7972 2n+1
6718 2n-1
5515 2k+1
4197 2k-1
2788 2m+1
2770 2i-1

subexpressions maths
2  16892594
3   7063566
4   3329619
5   2174033
6   1084395
7    880405
8    508236
9    417702
10   270138
11   239619
12   163657
13   148238
14   110081
15   103399
 
Total=34338150

Operators in expressions in C

0 1194211
1 637206
2 122388
3 59027
4 18425
5 12528
6 4178
7 3954
8 1585
9 1435
10 626
11 825
12 300
13 336
14 204 
15 281
16 128
17 140
18 52 
19 85 
20 50


# Counts of mathematical expressions by number of subexpressions
# (values from the "subexpressions maths" table: the first entry is
# for 2 subexpressions, the last for 15).
se <- c(
  16892594, 7063566, 3329619, 2174033, 1084395, 880405, 508236,
  417702, 270138, 239619, 163657, 148238, 110081, 103399
)

# Total reported alongside that table; it is the percentage
# denominator.  Note that it is larger than sum(se).
total_se <- 34338150

# One row per count: its percentage of the reported total, plus the
# value used on the x-axis (2, 3, ..., length(se) + 1).
m_op_expr <- data.frame(
  expr = 100 * se / total_se,
  ops = seq_along(se) + 1
)

# Counts of C expressions by the number of operators they contain
# (values from the "Operators in expressions in C" table: the first
# entry is for 0 operators, the last for 20).
c_eop <- c(
  1194211, 637206, 122388, 59027, 18425, 12528, 4178,
  3954, 1585, 1435, 626, 825, 300, 336,
  204, 281, 128, 140, 52, 85, 50
)

# Percentages over every expression, left disabled:
# total_c_eop=2057964
# 100*c_eop/total_c_eop

# Discard the first two counts (0 and 1 operators) so the remaining
# counts start at 2 operators, and use their sum as the denominator.
c_2eop <- c_eop[-c(1, 2)]
total_c_2eop <- sum(c_2eop)

# One row per remaining count: its percentage of that sum, plus the
# operator count used on the x-axis (2, 3, ..., length(c_2eop) + 1).
c_2op_expr <- data.frame(
  expr = 100 * c_2eop / total_c_2eop,
  ops = seq_along(c_2eop) + 1
)

# Scatter plot of the mathematics percentages (red) on a log-scaled
# y-axis, with the C source percentages (blue) overlaid.
# The x column is named in full (`ops`) rather than relying on `$`
# partial matching of data frame column names.
# NOTE(review): the axis limits are derived from the mathematics data
# alone, so C points outside that range are not drawn -- confirm this
# is intended.
plot(m_op_expr$ops, m_op_expr$expr,
  log = "y", col = "red",
  las = 1,
  xlab = "Operators", ylab = "Expressions (percentage)"
)
points(c_2op_expr$ops, c_2op_expr$expr, col = "blue")

legend(
  x = "topright", legend = c("Mathematics", "C source"),
  bty = "n", fill = c("red", "blue"), cex = 1.2
)

# Fit a straight line to log(percentage) against `ops` for the
# mathematics data (glm with its default family).
m_mod <- glm(log(expr) ~ ops, data = m_op_expr)
summary(m_mod)

# Predictions are on the log scale, so exponentiate them before
# adding the fitted curve to the current plot.
pred <- predict(m_mod)
lines(x = m_op_expr$ops, y = exp(pred), col = "red")


# Fit the C source data using only its first ten rows, so that rare
# uses do not skew the fit.
c_mod <- glm(log(expr) ~ ops, data = c_2op_expr, subset = 1:10)
summary(c_mod)

# Exponentiate the log-scale predictions and draw them against the
# same ten x values that were used in the fit.
pred <- predict(c_mod)
lines(x = c_2op_expr$ops[seq_len(10)], y = exp(pred), col = "blue")