#            chkdbl.awk, 20 Aug 18
#
# Check for double words, unmatched (), sentences missing a .
# and a/an [aeiou]
#
# Optional input: the variable WORDS names a file holding one word per
# line.  When it is non-empty the space/letter transposition check
# (chk_last_first_letter) is enabled.  WORDS is read in BEGIN, so it has
# to be set before BEGIN runs (e.g., awk -v WORDS=file -f chkdbl.awk).
#
# Output prefixes:
#   "!!! "            a/an mismatch
#   "$$$ "            possible space/letter transposition
#   "%%% "            suspicious single word
#   "sus pair "       suspicious adjacent word pair
#   "<line>: ..."     repeated word (or repeated word pair, "<line>| ...")
#   "((( "            unbalanced parentheses (see NOTE on that rule)

# Report "a" followed by a word starting with a vowel (or the numbers
# "11"/"8"), and "an" followed by anything else.
# Reads the globals last_word and orig_word set by the main rule.
function chk_a_an()
{
   if ((last_word == "a") &&
       ((substr(orig_word, 1, 1) ~ /[aeiou]/) ||
        (orig_word == "11") ||
        (orig_word == "8")))
      print "!!! a should be an| " $0
   else if ((last_word == "an") &&
            !(((substr(orig_word, 1, 1) ~ /[aeiou]/) ||
               (orig_word == "11") ||
               (orig_word == "8"))))
      print "!!! an should be a| " $0
}

# Move the final character of last_word to the front of cur_word and
# report the line if both resulting strings appear in word_list
# (e.g., "tot he" -> "to the").  Pairs listed in cur_ignore, last_ignore
# or last_cur_ignore are skipped.  Every reported pair is counted in
# last_cur_match for the summary printed by END.
function chk_last_first_letter()
{
   #new_last_word=last_word substr(cur_word, 1, 1)
   #new_cur_word=substr(cur_word, 2)
   new_last_word=substr(last_word, 1, length(last_word)-1)
   new_cur_word=substr(last_word, length(last_word)) cur_word

   if ((cur_ignore[new_cur_word] == 1) ||
       (last_ignore[new_last_word] == 1) ||
       (last_cur_ignore[new_last_word new_cur_word] == 1))
      return

   if ((word_list[new_last_word] == 1) &&
       (word_list[new_cur_word] == 1))
      {
      print "$$$ space/letter transpose " new_last_word ":" new_cur_word " | " $0
      last_cur_match[new_last_word " " new_cur_word]++
      }
}

BEGIN {
   lastm2_word=""
   lastm1_word=""
   last_last_word=""

   # Words that may legitimately occur as "X y X", so the
   # word-before-last repeat check stays silent for them.
   last_last_ignore["as"]=1
   last_last_ignore["of"]=1
   last_last_ignore["or"]=1
   last_last_ignore["to"]=1
   last_last_ignore["and"]=1
   last_last_ignore["the"]=1

   last_word=""
   line_num=0;

   if (WORDS == "")
      chk_word_pairs=0
   else
      {
      chk_word_pairs=1
      # getline returns -1 when the file cannot be read; testing "> 0"
      # stops that case from looping forever.  Reading into a variable
      # leaves $0 and NF untouched.
      while ((words_status = (getline dict_word < WORDS)) > 0)
         word_list[dict_word]=1
      if (words_status < 0)
         {
         print "chkdbl.awk: cannot read word list " WORDS > "/dev/stderr"
         chk_word_pairs=0
         }
      else
         close(WORDS)
      }

   # Single words that are usually the result of a misplaced space
   # (e.g., "use din" for "used in").
   sus_words["din"]=1
   sus_words["dis"]=1
   sus_words["don"]=1
   sus_words["shave"]=1
   sus_words["sin"]=1
   sus_words["sis"]=1
   sus_words["sonly"]=1
   sus_words["snot"]=1
   sus_words["tis"]=1
   sus_words["twas"]=1

   # NOTE(review): sus_word_pairs is not read anywhere in this file.
   sus_word_pairs["inone"]=1

   # Adjacent word pairs ("previous current") that are reported.
   sus_pair["a one"]=1
   sus_pair["a the"]=1
   sus_pair["and having"]=1
   sus_pair["and is"]=1
   sus_pair["any with"]=1
   sus_pair["are have"]=1
   sus_pair["are tend"]=1
   sus_pair["by to"]=1
   sus_pair["for same"]=1
   sus_pair["from for"]=1
   sus_pair["have make"]=1
   sus_pair["is comes"]=1
   sus_pair["keeps gives"]=1
   sus_pair["of from"]=1
   sus_pair["of whose"]=1
   sus_pair["only could"]=1
   sus_pair["sufficient of"]=1
   sus_pair["the a"]=1
   sus_pair["the that"]=1
   sus_pair["the what"]=1
   sus_pair["there the"]=1
   sus_pair["there unlikely"]=1
   sus_pair["be uses"]=1
   sus_pair["to used"]=1
   sus_pair["tot he"]=1
   sus_pair["use din"]=1
   sus_pair["with do"]=1
   sus_pair["using with"]=1
   sus_pair["usual asked"]=1

   # Transposition candidates that chk_last_first_letter must not
   # report: shortened previous word, lengthened current word, and the
   # two concatenated.
   last_ignore["asp"]=1
   last_ignore["ares"]=1
   last_ignore["butt"]=1
   last_ignore["hast"]=1
   last_ignore["ism"]=1
   last_ignore["fort"]=1
   last_ignore["oft"]=1
   last_ignore["sot"]=1
   last_ignore["tot"]=1
   last_ignore["wast"]=1

   cur_ignore["educe"]=1
   cur_ignore["hem"]=1
   cur_ignore["unction"]=1
   cur_ignore["unctions"]=1
   cur_ignore["umber"]=1

   last_cur_ignore["bethe"]=1
   last_cur_ignore["blockscope"]=1
   last_cur_ignore["filescope"]=1
   last_cur_ignore["amember"]=1
   last_cur_ignore["theevaluation"]=1
   last_cur_ignore["whitespace"]=1
}

# Count every input line, including the ones skipped below.
{
   line_num++;
}

# Lines with no fields (empty or whitespace only) and the end-of-code
# marker are skipped.
# NOTE(review): in_code is only ever assigned 0 in this file.
$1 == "" || $0 == ".E_CODE" {
   in_code=0
   next
}

# Headings and markup directives are not running text; reset last_word
# so that no word pair is formed across them.
$0 == "" ||
$0 == "C99" ||
$0 == "C90" ||
$0 == "C++" ||
$0 == "Common implementations" ||
$0 == "Other languages" ||
$0 == "Coding guidelines" ||
$0 == "Example" ||
$0 == "Usage" ||
$0 == ".BULLET" ||
$0 == ".E_BULLET" ||
$0 == ".TABLE" ||
$0 == ".E_TABLE" ||
$0 == ".FIG" ||
$0 == ".E_FIG" ||
$0 == ".QUOTE" ||
$0 == ".E_QUOTE" ||
$0 == ".DIFF" ||
$0 == ".E_DIFF" {
   last_word="not a word"
   next
}

# Blank line: report unbalanced parentheses in the text seen so far.
# NOTE(review): this rule cannot currently be reached, because the first
# skip rule ($1 == "") already consumes every empty line.  Left as is,
# since enabling it would change the report output.
$0 == "" {
#   if ((dot_last == 0) && (in_code == 0))
#      print "... " last_line
   if (paren_nest != 0)
      print "((( " line_num " " last_line
   paren_nest=0
   dot_last=1
   next
}

# Ordinary text line: run the per-word checks.
{
   cur_line=$0
   # gsub returns the number of substitutions, i.e., the count of
   # opening minus closing parentheses on this line.
   paren_nest+=(gsub(/\(/, " ", cur_line)-gsub(/\)/, " ", cur_line))
   dot_last=(substr($0, length($0)) == ".")
   last_line=$0

   for (ind=1; ind <= NF; ind++)
      {
      cur_word=tolower($ind)
      orig_word=cur_word
      # We don't want a following full stop or comma to
      # affect matches against previous words (only following words)
      gsub("\\.", "", cur_word)
      gsub("\\(", "", cur_word)
      gsub("\\)", "", cur_word)
      gsub(",", "", cur_word)
      gsub(";", "", cur_word)
      gsub(":", "", cur_word)

      # Same word as the one before the previous word ("X y X").
      if (last_last_word == cur_word)
         {
         if (!last_last_ignore[last_last_word])
            print line_num ": " last_last_word ":: " $0
         }
      # Immediately repeated word ("X X").
      if (last_word == cur_word)
         {
         print line_num ": :" last_word ": " $0
         }
      # Repeated pair of words ("X y X y").
      if ((lastm2_word lastm1_word) == (last_word cur_word))
         {
         print line_num "| " lastm2_word " " lastm1_word "|" last_word " " $ind "| " $0
         }
      # Key is "previous current"; the previous word keeps any trailing
      # punctuation, so pairs split by punctuation do not match.
      if (sus_pair[last_word " " cur_word] == 1)
         {
         print "sus pair " last_word ":" cur_word " " $0
         }

      lastm2_word=lastm1_word
      lastm1_word=last_word

      gsub("\\(", "", orig_word)
      chk_a_an()

      if (sus_words[cur_word] == 1)
         print "%%% suspicious word " cur_word " | " $0

      # A very high percentage of false positives
      if (chk_word_pairs)
         chk_last_first_letter()

      last_last_word=last_word
      # Trailing punctuation is kept, so "word. Word" is not a repeat.
      last_word=orig_word
      }
   next
}

# Summary: how often each transposition candidate was reported.
END {
   for (w in last_cur_match)
      print last_cur_match[w] " " w
}