# # railroad.awk, 11 Jun 12 # # Take a yacc/bison file as input and generate railroad diagrams # (well, at least something to feed into dot and which then ...) function save_rhs() { rule_rhs[rule_name]=rule_rhs[rule_name] "|" rhs_str # not yet used all_rule[num_lhs]=all_rule[num_lhs] "|" rhs_str rhs_str="" } function get_rhs(start_field) { # Get all tokens to the right of the colon in a grammar if ($start_field == "|") { start_field++ save_rhs() } # Does not yet handle { ... } embedded in rhs for (i=start_field; i < NF; i++) { word_cnt[$i]++ # how many times have we seen this word? if (rhs_str == "") rhs_str=$i else rhs_str=rhs_str " " $i } if ($NF == ";") in_rule=0 if (($NF == ";") || ($NF == "|")) save_rhs() else if (NF >= start_field) rhs_str=rhs_str " " $NF } function gen_start_rule(r_num, rule_name, num_rules) { # .dot stuff needed to encapsulate production rules gen_nest_depth=0 total_nodes=0 print "digraph rule_" r_num "{" print "fontname=helvetica ; fontsize=14" print "node[width=0.3, height=0.2]" # Name of the production print "label=\"" rule_name "\"" gen_start_to_end(num_rules) } function gen_end_rule() { # matching end to gen_start_rule print "}" print "}" } function get_new_node() { # each .dot node has a uniqu enumber total_nodes++ return total_nodes } function push_gen_nest() { # At the moment there is only one nesting level, but we have ambitions gen_nest_depth++ start_stack[gen_nest_depth]=cur_start_node end_stack[gen_nest_depth]=cur_end_node cur_start_node=get_new_node() cur_end_node=get_new_node() } function gen_start_to_end(num_rules) { # Generate a start/end node push_gen_nest() print "n__ [label=\"...\"]" gen_edge("_", ":s", cur_start_node, ":n") gen_node_right(cur_end_node, "...") # Using a subgraph cluster gives us a box around everything in the rule print "subgraph cluster_rule {" print "node[fontsize=" int((17*7)/(7+num_rules)) "]" #print "node[fontname=helvetica]" print "label=\"\"" print "n_" cur_start_node " [" print "label=\"\", color=\"white\", shape=point, height=0.01, width=0.01]" print "n_" cur_end_node " [" print "label=\"\", color=\"white\", shape=point, height=0.01, width=0.01]" } function gen_edge(from_n, from_pos, to_n, to_pos) { # Generate an edge print "n_" from_n from_pos " -> n_" to_n to_pos } function gen_node_right(from_node, new_node_str) { # as if reading right to left return gen_node(from_node, new_node_str, 1) } function gen_node_left(from_node, new_node_str, first_in_list) { # as if reading left to right return gen_node(from_node, new_node_str, 0, first_in_list) } function gen_node(from_node, new_node_str, go_right, first_in_list) { # Assume everything in upper case is a token if (new_node_str ~ /^[A-Z]+$/) new_node_str="\\\"" tolower(new_node_str) "\\\"" new_node_num=get_new_node() print "n_" new_node_num "[label=\"" new_node_str "\"]" if (go_right) gen_edge(from_node, ":s", new_node_num, ":n") else # The edge start end point can make a big difference to how the line looks if (first_in_list) gen_edge(from_node, ":w", new_node_num, ":se") else gen_edge(from_node, ":n", new_node_num, ":s") return new_node_num } function gen_loop_back(rule_name, parts, num_parts) { # A loop back occurs when a rule is self recursive, e.g., # x : y | # x comma y # we generate a "y" and then "comma" is in the loopback path np_cur_node=cur_end_node is_first=1 for (np=1; np <= num_parts; np++) { #print "lb " np " " parts[np] if (parts[np] != "") { np_cur_node=gen_node_left(np_cur_node, parts[np], is_first) is_first=0 } } # dot lays things out differently if the loopback is empty if (is_first) gen_edge(np_cur_node, ":n", cur_start_node, ":e") else gen_edge(np_cur_node, ":n", cur_start_node, ":w") } function gen_one_rule(rule_name, rule, end_node) { #print "gen one " rule_name " : " rule num_parts=split(rule, parts, " ") # TODO # If there is only one name and it is only used here, then # we should pint it here if (num_parts == 1) { if (word_cnt[parts[1]] == 1) ; } np_cur_node=cur_start_node for (np=1; np <= num_parts; np++) { np_cur_node=gen_node_right(np_cur_node, parts[np]) } if (end_node != 0) gen_edge(np_cur_node, ":s", end_node, ":n") return np_cur_node } function gen_two_rules(many_rule) { # Handle common special cases such an optional constructs # and self recursive rules rule_name=many_rule[1] rule_1=many_rule[2] rule_2=many_rule[3] #print "gen two " rule_name " : " rule_1 " | " rule_2 num_part_1=split(rule_1, part_1, " ") # an optional construct? if (num_part_1 == 0) { gen_edge(cur_start_node, ":s", cur_end_node, ":n") gen_one_rule(rule_name, rule_2, cur_end_node) return } num_part_2=split(rule_2, part_2, " ") # a left recursive rule? if ((part_2[1] == rule_name) && (part_2[num_part_2] == part_1[1])) { part_2[1]="" gen_one_rule(rule_name, part_2[num_part_2], cur_end_node) part_2[num_part_2]="" gen_loop_back(rule_name, part_2, num_part_2) return } # a right recursive rule? if ((part_2[num_part_2] == rule_name) && (part_2[1] == part_1[1])) { part_2[num_part_2]="" gen_one_rule(rule_name, part_2[1], cur_end_node) part_2[1]="" gen_loop_back(rule_name, part_2, num_part_2) return } gen_many_rules(many_rule, 3) } function gen_many_rules(many_rule, num_prod) { # Check if all rules start with the same token sequence r1_cnt=split(many_rule[2], r1_toks, " ") num_common=r1_cnt # What is the longest number of tokens? for (ri=3; ri <= num_prod; ri++) { rn_cnt=split(many_rule[ri], rn_toks, " ") for (rc=1; rc <= num_common; rc++) if (r1_toks[rc] != rn_toks[rc]) { num_common=rc-1 break } if (num_common == 0) break } if (num_common > 0) { # Generate diagram for common sequence subrule=r1_toks[1] for (ri=2; ri <= num_common; ri++) subrule=subrule " " r1_toks[ri] #print "sr " subrule new_start_node=gen_one_rule(many_rule[1], subrule, 0) cur_start_node=new_start_node # Remove common part from all rules for (ri=2; ri <= num_prod; ri++) { rn_cnt=split(many_rule[ri], rn_toks, " ") subrule=rn_toks[num_common+1] for (rc=num_common+2; rc <= rn_cnt; rc++) subrule=subrule " " rn_toks[rc] many_rule[ri]=subrule } } # Generate diagram for each rule for (ri=2; ri <= num_prod; ri++) gen_one_rule(many_rule[1], many_rule[ri], cur_end_node) } BEGIN { in_rule=0 curl_count=0 num_lhs=0 total_nodes=0 in_C_comment=0 } NF == 0 || $1 == "//" { next } in_C_comment == 1 { if ($0 ~ /\*\//) in_C_comment=0 next } $1 == "/*" { in_C_comment=1 if ($0 ~ /\*\//) in_C_comment=0 next } $1 == "{" { # handle nested {} curl_count++ next } $1 == "}" { # handle nested {} curl_count-- if (curl_count == 0) { if ($NF == ";") in_rule=0 if (($NF == ";") || ($NF == "|")) save_rhs() } next } curl_count > 0 { # skip code associated with rule next } in_rule == 1 { get_rhs(1) next } # Look for identifier following by colon $1 ~ /^[a-zA-Z_0-9]+:$/ || ($2 == ":" && $1 ~ /^[a-zA-Z_0-9]+$/) { in_rule=1 num_lhs++ sub(":", "", $1) rhs_str="" rule_name=$1 rule_rhs[rule_name]="" all_rule[num_lhs]=rule_name if ($2 == ":") get_rhs(3) else get_rhs(2) next } { next } END { for (r=1; r <= num_lhs; r++) { num_prod=split(all_rule[r], rule_parts, "|") if (num_prod == 0) continue gen_start_rule(r, rule_parts[1], num_prod) if (num_prod == 2) # rule name always followed by | gen_one_rule(rule_parts[1], rule_parts[2], cur_end_node) else if (num_prod == 3) # rule name always followed by | gen_two_rules(rule_parts) else gen_many_rules(rule_parts, num_prod) gen_end_rule() } }