-
Notifications
You must be signed in to change notification settings - Fork 2
/
emu_grammars.py
1812 lines (1451 loc) · 68.5 KB
/
emu_grammars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/python3
# ecmaspeak-py/emu_grammars.py:
# Analyze <emu-grammar> elements.
#
# Copyright (C) 2018 J. Michael Dyck <[email protected]>
import atexit, subprocess, re, time, sys, pdb
from collections import namedtuple, defaultdict, OrderedDict
import DFA
import shared
from shared import stderr, msg_at_node, msg_at_posn, spec, SpecNode, decode_entities
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
def do_stuff_with_emu_grammars():
    """Parse and check every <emu-grammar> element in the spec.

    Opens the 'emu_grammars_misc' output (kept in the global `egm_f`),
    parses each <emu-grammar> element and groups the elements by
    grammar type. If any element fails to parse, all further grammar
    processing is skipped.

    Returns:
        False if at least one <emu-grammar> did not parse, True otherwise.
    """
    global egm_f

    stderr('do_stuff_with_emu_grammars...')
    egm_f = shared.open_for_output('emu_grammars_misc')

    # Bucket the elements by grammar type while counting parse failures.
    emu_grammars_of_type_ = {
        grammar_type: []
        for grammar_type in ('definition', 'example', 'reference')
    }
    n_invalid = 0
    for emu_grammar in spec.doc_node.each_descendant_named('emu-grammar'):
        if not parse_emu_grammar(emu_grammar):
            n_invalid += 1
        emu_grammars_of_type_[get_grammar_type(emu_grammar)].append(emu_grammar)

    if n_invalid:
        stderr(f"GIVING UP ON FURTHER GRAMMAR PROCESSING due to {n_invalid} emu-grammars that didn't parse")
        return False

    egm_log('<emu-grammar> counts:')
    for t in sorted(emu_grammars_of_type_):
        egm_log(' ', len(emu_grammars_of_type_[t]), t)

    # The remaining steps run in a fixed order.
    process_defining_emu_grammars(emu_grammars_of_type_['definition'])
    check_reachability() # not that useful?
    check_non_defining_prodns(emu_grammars_of_type_['reference'])
    # check_order_of_RHSs_within_each_SDO_clause()
    # too many complaints
    check_emu_prodrefs(spec.doc_node)
    approximate_annex_A()
    check_nonterminal_refs(spec.doc_node)
    make_grammars()
    do_grammar_left_right_stuff()

    egm_f.close()
    return True
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
def is_defining_grammar(node):
    """Return True iff `node` is an <emu-grammar> element of type 'definition'."""
    if node.element_name != 'emu-grammar':
        return False
    return get_grammar_type(node) == 'definition'
def get_grammar_type(emu_grammar):
    """Classify an <emu-grammar> element from its attributes.

    Returns 'example' when the element has an `example` attribute;
    otherwise the value of its `type` attribute, defaulting to 'reference'.
    """
    attrs = emu_grammar.attrs
    return 'example' if 'example' in attrs else attrs.get('type', 'reference')
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
def parse_emu_grammar(emu_grammar):
# Parse the content of one <emu-grammar> element against `metagrammar`
# and decorate the resulting tree.
#
# The parse result (a GNode, or None on failure) is stored in
# emu_grammar._gnode. Returns False if the content did not parse,
# True otherwise.
#
# Decorations set here:
# - on the root gnode: _productions
# - on each production: _lhs_symbol, _param_names, _num_colons, _rhss
# - on each RHS node: _rhs_items, _reduced
# (decorate_misc, applied to the whole tree, sets further attributes.)
assert emu_grammar.element_name == 'emu-grammar'
if '\n' in emu_grammar.source_text():
# one or more productions, indented wrt the <emu-grammar> tag, separated by blank line.
goal = 'EMU_GRAMMAR_CONTENT_2'
# The element's indent is its offset from the start of the line it begins on.
line_start_posn = 1 + shared.spec_text.rfind('\n', 0, emu_grammar.start_posn)
emu_grammar_indent = emu_grammar.start_posn - line_start_posn
assert emu_grammar_indent in [2, 4, 6, 8, 10]
else:
# a single one-line production (no line-breaks)
goal = 'EMU_GRAMMAR_CONTENT_1'
emu_grammar_indent = None
gnode = simple_parse(
metagrammar,
goal,
emu_grammar.inner_start_posn,
emu_grammar.inner_end_posn,
emu_grammar_indent)
emu_grammar._gnode = gnode
if gnode is None:
egm_log(f" parse_emu_grammar is returning False for {emu_grammar.source_text()}")
return False
# --------------------------------------------
# Perform some checks that could have been expressed in the meta-grammar,
# but weren't.
# Also, decorate some nodes for ease of subsequent processing
gnode.preorder_traversal(decorate_misc)
# Normalize so that gnode._productions is always a list of production nodes,
# whichever of the two goals was parsed.
if gnode.kind == 'BLOCK_PRODUCTIONS':
gnode._productions = gnode.children
elif gnode.kind == 'ONELINE_PRODUCTION':
gnode._productions = [gnode]
else:
assert 0, gnode.kind
for production_n in gnode._productions:
# A production's children are: the LHS GNT, the colons, and the right-hand part.
(gnt_n, colons_n, r_n) = production_n.children
(_, params_n, opt_n) = gnt_n.children
# --------------------------------------------
production_n._lhs_symbol = gnt_n._nt_name
# --------------------------------------------
assert params_n.kind in ['OMITTED_OPTIONAL', 'PARAMS']
# On LHS, prodn params must have no prefix.
params_n.preorder_traversal(lambda n: check_param_prefix(n, False))
# On RHS, prodn params must have a prefix.
r_n.preorder_traversal(lambda n: check_param_prefix(n, True))
# groups[1] is the parameter name (second capture group of the PARAM regex).
production_n._param_names = [
param_n.groups[1]
for param_n in params_n.children
]
# --------------------------------------------
# LHS can't be optional.
if opt_n.source_text() != '':
msg_at_node(opt_n, "LHS cannot be optional")
# --------------------------------------------
production_n._num_colons = len(colons_n.source_text())
# --------------------------------------------
# Collect the production's right-hand sides into `rhss`,
# giving each one an _rhs_items list.
rhss = []
if production_n.kind == 'MULTILINE_PRODUCTION':
if r_n.kind == 'MULTILINE_RHSS':
# The standard case: each line is a separate RHS.
for rhs_line_n in r_n.children:
assert rhs_line_n.kind == 'RHS_LINE'
rhss.append(rhs_line_n)
# -----
(optional_guard_n, rhs_body_n, optional_label_n) = rhs_line_n.children
# The items of this RHS are: the guard (if present),
# then the body's items, then the label (if present).
items = []
if optional_guard_n.source_text() != '':
items.append(optional_guard_n)
if rhs_body_n.kind == 'EMPTY':
pass
elif rhs_body_n.kind in ['U_RANGE', 'U_PROP', 'U_ANY']:
items.append(rhs_body_n)
elif rhs_body_n.kind == 'RHS_ITEMS':
items.extend(rhs_body_n.children)
else:
assert 0, rhs_body_n.kind
if optional_label_n.source_text() != '':
items.append(optional_label_n)
rhs_line_n._rhs_items = items
elif r_n.kind == 'MULTILINE_ONE_OF':
# Each backticked_thing on each line is a separate RHS.
[lines_of_backticked_things_n] = r_n.children
assert lines_of_backticked_things_n.kind == 'LINES_OF_BACKTICKED_THINGS'
for backticked_things_n in lines_of_backticked_things_n.children:
assert backticked_things_n.kind == 'BACKTICKED_THINGS'
for backticked_thing_n in backticked_things_n.children:
rhss.append(backticked_thing_n)
# -----
backticked_thing_n._rhs_items = [backticked_thing_n]
else:
assert 0
elif production_n.kind == 'ONELINE_PRODUCTION':
# A one-line production has a single RHS,
# unless it is a 'one of' list, where each backticked thing is its own RHS.
if r_n.kind == 'EMPTY':
rhss.append(r_n)
# -----
r_n._rhs_items = []
elif r_n.kind == 'RHS_ITEMS':
rhss.append(r_n)
# -----
r_n._rhs_items = r_n.children
elif r_n.kind == 'ONELINE_ONE_OF':
[backticked_things_n] = r_n.children
for backticked_thing_n in backticked_things_n.children:
rhss.append(backticked_thing_n)
# -----
backticked_thing_n._rhs_items = [backticked_thing_n]
else:
assert 0, r_n.kind
else:
assert 0, production_n.kind
# Give each RHS its reduced string form (see reduce_rhs).
for rhs in rhss:
rhs._reduced = reduce_rhs(rhs)
production_n._rhss = rhss
return True
def reduce_rhs(rhs_n):
    """Return a simplified, space-separated string form of a right-hand side.

    Terminal-like items contribute their source text, a GNT contributes
    its nonterminal text plus its optionality text (its params are
    dropped), and annotation-like items contribute nothing.
    Any other item kind trips an assertion.
    """
    verbatim_kinds = (
        'BACKTICKED_THING',
        'NAMED_CHAR',
        'U_ANY',
        'U_PROP',
        'U_RANGE',
    )
    ignored_kinds = (
        'BUT_ONLY',
        'BUT_NOT',
        'LABEL',
        'LAC_SET',
        'LAC_SINGLE',
        'NLTH',
        'PARAMS',
    )

    reduced = []
    for item in rhs_n._rhs_items:
        kind = item.kind
        if kind == 'GNT':
            # Drop the params
            (nt_n, _params_n, opt_n) = item.children
            reduced.append(nt_n.source_text() + opt_n.source_text())
        elif kind in verbatim_kinds:
            reduced.append(item.source_text())
        elif kind not in ignored_kinds:
            assert False, kind
    return ' '.join(reduced)
def decorate_misc(node):
    """Attach convenience attributes to `node`, depending on its kind.

    Used as a preorder-traversal callback. Sets:
      - GNT: _nt_name, _params, _is_optional
      - NT: _nt_name
      - BACKTICKED_THING: _chars (entity-decoded content)
      - BUT_NOT: _excludables on its single child

    Returns 'prune' for GNT and BACKTICKED_THING nodes, None otherwise.
    """
    kind = node.kind

    if kind == 'GNT':
        (nt_n, params_n, opt_n) = node.children
        node._nt_name = nt_n.source_text()
        node._params = [param_n.groups for param_n in params_n.children]
        node._is_optional = (opt_n.source_text() == '?')
        return 'prune'

    if kind == 'BACKTICKED_THING':
        node._chars = decode_entities(node.groups[0])
        return 'prune'

    if kind == 'NT':
        node._nt_name = node.source_text()
        return None

    if kind == 'BUT_NOT':
        [exclusion_n] = node.children
        if exclusion_n.kind == 'EXCLUDABLES':
            exclusion_n._excludables = exclusion_n.children
            assert len(exclusion_n._excludables) > 1
        else:
            # A lone excludable stands for a one-element list of itself.
            exclusion_n._excludables = [exclusion_n]

    return None
def check_param_prefix(node, must_have_prefix):
    """Report a parameter prefix that is wrong for its side of a production.

    Used as a preorder-traversal callback. Nodes whose kind is not
    'OPTIONAL_PREFIX' are ignored.

    Args:
        node: the node being visited.
        must_have_prefix: True when checking a right-hand side (where a
            prefix is required), False when checking a left-hand side
            (where a prefix is forbidden).

    Returns:
        None for nodes of any other kind; 'prune' for an
        'OPTIONAL_PREFIX' node (presumably telling the traversal not to
        descend further -- confirm against preorder_traversal).
    """
    # NOTE(review): the metagrammar in this file has no 'OPTIONAL_PREFIX'
    # symbol (the PARAM regex captures the prefix as a group instead),
    # so this check may never fire -- confirm.
    if node.kind != 'OPTIONAL_PREFIX':
        return

    o_p_text = node.source_text()
    if o_p_text != '' and not must_have_prefix:
        msg_at_node(node, "On LHS, param must not have a prefix")
    elif o_p_text == '' and must_have_prefix:
        msg_at_node(node, "On RHS, param must have a prefix")

    return 'prune'
# ------------------------------------------------------------------------------
# The grammar of <emu-grammar> content, in the form interpreted by simple_parse.
#
# Each entry maps a symbol name to a tuple (pkind, rkind, *args).
#
# pkind says how the symbol is matched:
# '|' : alternatives (args are tried in order; the first that matches wins)
# '_' : concatenation (args are matched in sequence)
# '?' : optional (args are matched in sequence, or the whole thing is omitted)
# '+' : list of one or more; args are (element, separator)
# or (element, separator, left_delim, right_delim)
# '/' : regular expression (the single arg is the pattern)
#
# rkind says what a successful match yields:
# 'n' : a GNode whose kind is the symbol name
# '^' : pass-up, i.e. the (single) child's node is used instead
# ' ' : no node
#
# An arg that is not a key of this dict is matched as a regular expression
# that yields no node.
metagrammar = {
'EMU_GRAMMAR_CONTENT_1': ('_', '^', 'ONELINE_PRODUCTION', 'EOI'),
'EMU_GRAMMAR_CONTENT_2': ('_', '^', 'INDENT', 'BLOCK_PRODUCTIONS', 'OUTDENT', 'NLAI', 'EOI'),
'BLOCK_PRODUCTIONS' : ('+', 'n', 'BLOCK_PRODUCTION', r'\n'),
'BLOCK_PRODUCTION' : ('|', '^', 'MULTILINE_PRODUCTION', '_ONELINE_PRODUCTION'),
'MULTILINE_PRODUCTION' : ('_', 'n', 'OPTIONAL_COMMENT_LINE', 'NLAI', 'GNT', ' ', 'COLONS', 'MULTILINE_R'),
'OPTIONAL_COMMENT_LINE': ('?', ' ', 'NLAI', '// emu-format ignore'),
'MULTILINE_R' : ('|', '^', 'MULTILINE_ONE_OF', 'MULTILINE_RHSS'),
'MULTILINE_ONE_OF' : ('_', 'n', ' one of', 'INDENT', 'NLAI', 'LINES_OF_BACKTICKED_THINGS', 'OUTDENT'),
'LINES_OF_BACKTICKED_THINGS': ('+', 'n', 'BACKTICKED_THINGS', 'NLAI'),
'_ONELINE_PRODUCTION' : ('_', '^', 'NLAI', 'ONELINE_PRODUCTION'),
'ONELINE_PRODUCTION' : ('_', 'n', 'GNT', ' ', 'COLONS', ' ', 'ONELINE_R'),
'ONELINE_R' : ('|', '^', 'ONELINE_ONE_OF', 'RHS_BODY'),
'ONELINE_ONE_OF' : ('_', 'n', 'one of ', 'BACKTICKED_THINGS'),
'BACKTICKED_THINGS' : ('+', 'n', 'BACKTICKED_THING', ' '),
'MULTILINE_RHSS' : ('+', 'n', 'RHS_LINE', '', 'INDENT', 'OUTDENT'),
'RHS_LINE' : ('_', 'n', 'NLAI', 'OPTIONAL_GUARD', 'RHS_BODY', 'OPTIONAL_LABEL'),
'OPTIONAL_GUARD' : ('?', '^', 'PARAMS', ' '),
'OPTIONAL_LABEL' : ('?', '^', ' ', 'LABEL'),
'RHS_BODY' : ('|', '^', 'U_RANGE', 'U_PROP', 'U_ANY', 'EMPTY', 'RHS_ITEMS'),
'RHS_ITEMS' : ('+', 'n', 'RHS_ITEM', ' '),
'RHS_ITEM' : ('|', '^', 'GNT', 'BACKTICKED_THING', 'NAMED_CHAR', 'LOOKAHEAD_CONSTRAINT', 'NLTH', 'BUT_ONLY', 'BUT_NOT'),
'GNT' : ('_', 'n', 'NT', 'OPTIONAL_PARAMS', 'OPTIONAL_OPT'),
'OPTIONAL_PARAMS' : ('?', '^', 'PARAMS'),
'PARAMS' : ('+', 'n', 'PARAM', ', ', r'\[', r'\]'),
'OPTIONAL_OPT' : ('?', 'n', r'\?'),
'LOOKAHEAD_CONSTRAINT' : ('|', '^', 'LAC_SINGLE', 'LAC_SET'),
'LAC_SINGLE' : ('_', 'n', r'\[lookahead ', 'LAC_SINGLE_OP', ' ', 'TERMINAL_SEQ', r'\]'),
'LAC_SINGLE_OP' : ('/', 'n', '==|!='),
'LAC_SET' : ('_', 'n', r'\[lookahead ', 'LAC_SET_OP', ' ', 'LAC_SET_OPERAND', r'\]'),
'LAC_SET_OP' : ('/', 'n', '∈|∉'),
'LAC_SET_OPERAND' : ('|', '^', 'NT', 'SET_OF_TERMINAL_SEQ'),
'SET_OF_TERMINAL_SEQ' : ('+', 'n', 'TERMINAL_SEQ', ', ', '{ ', ' }'),
'TERMINAL_SEQ' : ('+', 'n', 'TERMINAL_ITEM', ' '),
'TERMINAL_ITEM' : ('|', '^', 'BACKTICKED_THING', 'NAMED_CHAR', 'NLTH'),
'BUT_ONLY' : ('/', 'n', r'\[> but only if ([^][]+)\]'),
'BUT_NOT' : ('_', 'n', 'but not ', 'EXCLUSION'),
'EXCLUSION' : ('|', '^', 'EXCLUDABLES', 'EXCLUDABLE'),
'EXCLUDABLES' : ('+', 'n', 'EXCLUDABLE', ' or | ', 'one of ', ''),
'EXCLUDABLE' : ('|', '^', 'NT', 'BACKTICKED_THING'),
# INDENT, OUTDENT, EOI and NLAI yield no node;
# simple_parse gives each of these four names special handling after the regex matches.
'INDENT' : ('/', ' ', ''),
'OUTDENT' : ('/', ' ', ''),
'EOI' : ('/', ' ', ''),
'NLAI' : ('/', ' ', r'\n +'),
# The remaining symbols are plain regular expressions that yield a node;
# the regex's capture groups are stored in the node's `groups`.
'COLONS' : ('/', 'n', r':+'),
'PARAM' : ('/', 'n', r'([~+?]?)([A-Z][a-zA-Z]*)'),
'NT' : ('/', 'n', r'[A-Z]\w*|uri\w*|@'),
'LABEL' : ('/', 'n', r'#\w+'),
'EMPTY' : ('/', 'n', r'\[empty\]'),
'NLTH' : ('/', 'n', r'\[no LineTerminator here\]'),
'U_RANGE' : ('/', 'n', r'> any Unicode code point in the inclusive interval from U\+([0-9A-F]+) to U\+([0-9A-F]+)'),
'U_PROP' : ('/', 'n', r'> any Unicode code point with the Unicode property “(\w+)”'),
'U_ANY' : ('/', 'n', r'> any Unicode code point'),
'BACKTICKED_THING' : ('/', 'n', r'`([^` ]+|`)`'),
'NAMED_CHAR' : ('/', 'n', r'<([A-Z]+)>'),
}
# ------------------------------------------------------------------------------
def simple_parse(grammar, goal, start_posn, end_posn, start_indent):
# Parse shared.spec_text[start_posn:end_posn] as an instance of `goal`,
# by recursive descent over `grammar`
# (a dict mapping symbol name -> (pkind, rkind, *args)).
#
# `start_indent` is the indentation in effect at the start of the text.
# The parse must consume exactly up to end_posn and finish with the indent
# back at start_indent (both are asserted).
#
# Returns the node for the match, or None if the text doesn't match,
# in which case a "Syntax error" message is issued (via msg_at_posn)
# at the furthest position where an expectation failed.

# Track the furthest position at which a match failed,
# and everything that was expected at that position.
max_error_posn = start_posn
max_error_expectations = []
def maybe_log_expectation(posn, expectation):
nonlocal max_error_posn, max_error_expectations
if posn > max_error_posn:
max_error_posn = posn
max_error_expectations = [expectation]
elif posn == max_error_posn:
max_error_expectations.append(expectation)
# `t` switches tracing on/off (see trace() below).
t = False # shared.spec_text.startswith('\n ReservedWord', start_posn)
def attempt(goal, at_start_posn, at_start_indent, level):
# Consider shared.spec_text[at_start_posn:end_posn]
# and attempt to match some prefix of it to `goal`.
# If it doesn't match, return None.
# If it does, return a tuple containing:
# - the posn after the match.
# - the current indent after the match.
# - a GNode representing the match, or None.
# (`level` is the recursion depth; it is only used to indent trace output.)
_ind = ' '*level
def trace(*args):
if not t: return
print(_ind, end='')
print(*args)
trace(f"{goal}")
trace(f"At {at_start_posn} {shared.spec_text[at_start_posn:at_start_posn+20]!r}")
if goal in grammar:
(pkind, rkind, *args) = grammar[goal]
else:
# `goal` isn't a grammar symbol, so treat it as a regular expression
# that yields no node.
assert not re.fullmatch(r'[A-Z_]+', goal), goal
pkind = '/'
rkind = ' '
args = [goal]
if pkind == '|': # alternatives
for alt in args:
r = attempt(alt, at_start_posn, at_start_indent, level+1)
if r is not None:
assert rkind == '^'
return r
# Note that this doesn't create a GNode corresponding to `goal` itself.
return None
elif pkind == '_': # concatenation
posn = at_start_posn
indent = at_start_indent
children = []
for child_goal in args:
r = attempt(child_goal, posn, indent, level+1)
if r is None: return None
(posn, indent, child) = r
# Only pieces that yielded a node become children.
if child: children.append(child)
if rkind == 'n':
result = GNode(at_start_posn, posn, goal, children)
elif rkind == '^':
# pass-up
assert len(children) == 1
[result] = children
else:
assert 0, rkind
return (posn, indent, result)
elif pkind == '?': # optional
posn = at_start_posn
indent = at_start_indent
children = []
for child_goal in args:
r = attempt(child_goal, posn, indent, level+1)
if r is None:
# We failed to find an instance of {child_goal},
# and so we've failed to find an instance of {goal}.
# I.e., the optional thing has been omitted.
# So (maybe) make a GNode to hold the representation of that absence.
if rkind == 'n':
result = GNode(at_start_posn, at_start_posn, goal, [])
elif rkind == '^':
result = GNode(at_start_posn, at_start_posn, 'OMITTED_OPTIONAL', [])
elif rkind == ' ':
result = None
else:
assert 0, rkind
# The omission consumes no text, so the posn returned is at_start_posn.
return (at_start_posn, indent, result)
(posn, indent, child) = r
if child: children.append(child)
# optional thing is there
if rkind == 'n':
result = GNode(at_start_posn, posn, goal, children)
elif rkind == '^':
assert len(children) == 1
[result] = children
elif rkind == ' ':
result = None
else:
assert 0, rkind
return (posn, indent, result)
elif pkind == '+': # list of one or more
if len(args) == 2:
(element_name, separator) = args
left_delim = None; right_delim = None
elif len(args) == 4:
(element_name, separator, left_delim, right_delim) = args
else:
assert 0, args
elements = []
posn = at_start_posn
indent = at_start_indent
if left_delim:
r = attempt(left_delim, posn, indent, level+1)
if r is None:
trace("failed at left_delim")
return None
(posn, indent, _) = r
# Alternate between matching an element and matching a separator.
while True:
r = attempt(element_name, posn, indent, level+1)
if r is None:
if elements == []:
# This would have been the first element in the list,
# so a failure to parse it is a syntax error.
trace("failed at first element")
return None
else:
# We've already recognized some elements,
# so failure to parse another isn't necessarily a syntax error,
# it could just be that we should have stopped after the latest element,
# i.e. just before the separator.
posn = latest_sep_start_posn
break
(posn, indent, element) = r
elements.append(element)
latest_sep_start_posn = posn
r = attempt(separator, posn, indent, level+1)
if r is None: break
(posn, indent, _) = r
if right_delim:
r = attempt(right_delim, posn, indent, level+1)
if r is None:
trace("failed at right delim")
return None
(posn, indent, _) = r
# A list always yields a GNode (rkind is not consulted here).
node = GNode(at_start_posn, posn, goal, elements)
return (posn, indent, node)
elif pkind == '/': # regular expression
[pattern] = args
mo = re.compile(pattern).match(shared.spec_text, at_start_posn)
if mo is None:
if goal == pattern:
expectation = repr(pattern)
else:
expectation = f"{goal} {pattern!r}"
maybe_log_expectation(at_start_posn, expectation)
trace("failed to match regex")
return None
assert mo.start() == at_start_posn
at_end_posn = mo.end()
trace(f"{at_start_posn}-{at_end_posn} found {goal!r}: {shared.spec_text[at_start_posn:at_end_posn]!r}")
# Some goals have extra effects or conditions beyond the regex match:
# INDENT/OUTDENT adjust the current indent by 2,
# NLAI requires the matched indentation to equal the current indent,
# and EOI requires that the end of the input has been reached.
indent = at_start_indent
if goal == 'INDENT':
indent += 2
trace(f"indent increased to {indent}")
elif goal == 'OUTDENT':
indent -= 2
trace(f"indent decreased to {indent}")
elif goal == 'NLAI':
# The match is a newline followed by spaces; exclude the newline from the count.
this_indent = at_end_posn - at_start_posn - 1
assert this_indent % 2 == 0
if this_indent != at_start_indent:
maybe_log_expectation(at_end_posn, f"indent = {at_start_indent}")
trace(f"failed to match indent = {at_start_indent}")
return None
elif goal == 'EOI':
if at_end_posn < end_posn:
maybe_log_expectation(at_end_posn, "end-of-input")
trace("failed to match eoi")
return None
if rkind == 'n':
result = GNode(mo.start(), mo.end(), goal, [])
# Keep the regex's capture groups for later use.
result.groups = mo.groups()
elif rkind == ' ':
result = None
else:
assert 0, rkind
return (at_end_posn, indent, result)
else:
assert 0, pkind
# input('continue? ')
r = attempt(goal, start_posn, start_indent, 0)
if r is None:
msg_at_posn(max_error_posn, f"Syntax error: was expecting: {', '.join(max_error_expectations)}")
return None
(at_end_posn, at_end_indent, node) = r
assert at_end_posn == end_posn
assert at_end_indent == start_indent
return node
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
class GNode(SpecNode):
    """A node in the parse tree that simple_parse builds for <emu-grammar> content.

    In addition to the span handled by SpecNode, a GNode records
    its `kind` (a symbol name) and its list of `children`.
    """

    def __init__(self, start_posn, end_posn, kind, children):
        SpecNode.__init__(self, start_posn, end_posn)
        self.children = children
        self.kind = kind

    def __str__(self):
        # Show at most 50 characters of the node's source text.
        text = self.source_text()
        if len(text) > 50:
            text = text[:47] + '...'
        return f"({self.kind} {text!r})"

    def tree_slug(self):
        return str(self)
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
# Maps each nonterminal name to its NonterminalInfo.
# (Re)initialized by process_defining_emu_grammars.
info_for_nt_ = None

def process_defining_emu_grammars(emu_grammars):
    """Record and check the productions of all defining <emu-grammar> elements.

    Resets the global `info_for_nt_`, then makes two passes:
      1. For each defining production, in document order: assign it a
         `doc_i` index, check its left-hand side, and register it.
         The grammar text of each element is also written to the
         'def_prodns' output.
      2. Check each production's right-hand sides; for a production
         that augments another, prepend the RHSs of the arena-'A'
         definition.

    Args:
        emu_grammars: the <emu-grammar> elements with type="definition".
    """
    global info_for_nt_

    egm_log('process_defining_emu_grammars...')

    grammar_f = shared.open_for_output('def_prodns')
    info_for_nt_ = defaultdict(NonterminalInfo)

    # The output is only written during the first pass,
    # so close it as soon as that pass is over (even if a check fails).
    try:
        # Each defining production is assigned a doc_i,
        # giving its index in document order.
        next_doc_i = 0

        for emu_grammar in emu_grammars:
            # emu_grammar is an HNode (see static.py)
            # representing an <emu-grammar> element
            # that contains rules that *define* a chunk of the grammar
            # (as opposed to merely reference it).
            assert emu_grammar.attrs['type'] == 'definition'

            cc_section = emu_grammar.closest_containing_section()

            print(file=grammar_f)
            print('#', cc_section.section_num, cc_section.section_title, file=grammar_f)
            print(file=grammar_f)
            print(decode_entities(trim_newlines(emu_grammar.inner_source_text())), file=grammar_f)

            # stderr(cc_section.section_num, cc_section.section_title)

            gnode = emu_grammar._gnode
            assert gnode.kind == 'BLOCK_PRODUCTIONS'

            for production_n in gnode.children:
                production_n.doc_i = next_doc_i
                next_doc_i += 1
                defining_production_check_left(production_n, cc_section)
    finally:
        grammar_f.close()

    # The right-hand sides are checked in a second pass,
    # after every defining production has been registered.
    for emu_grammar in emu_grammars:
        gnode = emu_grammar._gnode
        for production_n in gnode.children:
            defining_production_check_right(production_n)

            if production_n._augments:
                egm_log(f" augmenting {production_n._lhs_symbol}")
                nt_info = info_for_nt_[production_n._lhs_symbol]
                base_production_n = nt_info.get_appropriate_def_occ('A')
                production_n._rhss = base_production_n._rhss + production_n._rhss
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
def defining_production_check_left(production_n, cc_section):
    """Check the LHS and colons of a defining production, and register it.

    Sets `production_n._arena` and `production_n._augments`, then records
    the production in `info_for_nt_` as the definition of its LHS symbol
    for that arena. Inconsistencies trip assertions.

    Args:
        production_n: a 'MULTILINE_PRODUCTION' node.
        cc_section: the section (emu-clause or emu-annex) containing it.
    """
    assert production_n.kind == 'MULTILINE_PRODUCTION'
    assert cc_section.element_name in ['emu-clause', 'emu-annex']

    # ------------------

    arena = get_grammar_arena_for_section(cc_section)
    production_n._arena = arena

    # In arena 'B', some productions are replacements, and some are augments.
    # Could detect it based on whether the preceding para says
    # "The following augments the <Foo> production in <section-num>:"
    # but easier to hard-code it.
    production_n._augments = (
        arena == 'B'
        and
        cc_section.section_title in [
            'FunctionDeclarations in IfStatement Statement Clauses',
            'Initializers in ForIn Statement Heads',
        ]
    )

    # ------------------

    # From here on, only the LHS and colons of the production are examined.

    lhs_symbol = production_n._lhs_symbol

    lhs_nt_pattern = (
        r'^uri([A-Z][a-z]+)?$'
        if cc_section.section_title == 'URI Syntax and Semantics'
        else r'^[A-Z][a-zA-Z0-9]+$'
    )
    assert re.match(lhs_nt_pattern, lhs_symbol), lhs_symbol

    # ==============================================

    if lhs_symbol in info_for_nt_:
        nt_info = info_for_nt_[lhs_symbol]
        # This production's data must agree with previously-extracted data:
        # same number of colons ...
        assert production_n._num_colons == nt_info.num_colons
        # ... and no earlier defining production for this symbol in this arena.
        assert arena not in nt_info.def_occs
    else:
        # First definition seen: initialize nt_info with this production's data.
        nt_info = NonterminalInfo()
        info_for_nt_[lhs_symbol] = nt_info
        nt_info.num_colons = production_n._num_colons
        nt_info.level = 'syntactic' if nt_info.num_colons == 1 else 'lexical'

    nt_info.def_occs[arena] = production_n
# ------------------------------------------------------------------------------
class NonterminalInfo:
    """What is known about one nonterminal.

    `def_occs` maps an arena name to the defining production
    for this nonterminal in that arena.
    """

    def __init__(self):
        self.def_occs = defaultdict()

    # if augments:
    # assert arena != 'A'
    # (_, params_from_arena_a, rhss_from_arena_a) = self.def_occs['A']
    # assert params == params_from_arena_a
    # rhss = rhss_from_arena_a + rhss

    def get_appropriate_def_occ(self, arena):
        """Return the definition for `arena`, falling back to arena 'A'.

        Returns None if there is a definition for neither.
        """
        for candidate in (arena, 'A'):
            if candidate in self.def_occs:
                return self.def_occs[candidate]
        return None
def check_reachability():
    """Log each lexical nonterminal that no lexical goal symbol can reach.

    Starting from a fixed set of goal symbols, follows the nonterminals
    referenced in the arena-'A' definition of each reached symbol.
    Afterwards, every nonterminal that has an arena-'A' definition,
    is not syntactic (i.e. num_colons != 1), and was not reached
    is reported via egm_log.

    Raises:
        KeyError: if a reached symbol has no arena-'A' definition.
    """
    from collections import deque

    egm_log("check_reachability...")

    queue = deque()
    lexical_symbols = set()

    def reach(symbol):
        # A symbol is queued exactly once: the first time it is reached.
        # (So there's no need to also search the queue for it.)
        if symbol not in lexical_symbols:
            lexical_symbols.add(symbol)
            # print(' push', symbol)
            queue.append(symbol)

    reach('InputElementDiv')
    reach('InputElementRegExp')
    reach('InputElementRegExpOrTemplateTail')
    reach('InputElementTemplateTail')
    reach('InputElementHashbangOrRegExp')

    # For lexical invocations of ParseText()...
    reach('StringNumericLiteral') # in StringToNumber()
    reach('StringIntegerLiteral') # in StringToBigInt()
    reach('UTCOffset') # in IsTimeZoneOffsetString() + ParseTimeZoneOffsetString()
    reach('Pattern') # in ParsePattern

    while queue:
        symbol = queue.popleft()
        nt_info = info_for_nt_[symbol]
        production_n = nt_info.def_occs['A']
        for rhs_n in production_n._rhss:
            for rhs_item_n in rhs_n._rhs_items:
                rthing_kind = rhs_item_n.kind
                if rthing_kind in ['GNT', 'NT']:
                    reach(rhs_item_n._nt_name)
                elif rthing_kind == 'BUT_NOT':
                    [exclusion_n] = rhs_item_n.children
                    for but_n in exclusion_n._excludables:
                        if but_n.kind == 'NT':
                            reach(but_n._nt_name)
                elif rthing_kind == 'LAC_SET':
                    [lac_set_op, lac_set_operand] = rhs_item_n.children
                    if lac_set_operand.kind == 'NT':
                        reach(lac_set_operand._nt_name)

    for (nt, nt_info) in sorted(info_for_nt_.items()):
        if 'A' in nt_info.def_occs and nt_info.num_colons != 1 and nt not in lexical_symbols:
            egm_log(' lexical symbol not reached:', nt)
# ------------------------------------------------------------------------------
# g_current_branch_name = subprocess.check_output('git rev-parse --abbrev-ref HEAD'.split(' ')).rstrip()
def defining_production_check_right(production_n):
    """Check the right-hand sides of a defining production.

    For each 'RHS_LINE', asserts that its guard params are well-formed,
    then checks the arguments of every nonterminal reference (GNT) in its
    body, reporting problems via msg_at_node. 'BACKTICKED_THING' RHSs
    need no checking. Any other RHS kind trips an assertion.

    Args:
        production_n: a production node that has already been decorated
            with _rhss, _param_names, _arena and _lhs_symbol.
    """
    for rhs_n in production_n._rhss:
        if rhs_n.kind == 'RHS_LINE':
            (optional_guard_n, rhs_body_n, _optional_label_n) = rhs_n.children

            guards = []
            for param_n in optional_guard_n.children:
                (prefix, param_name) = param_n.groups
                assert prefix in ['+', '~']
                assert param_name in production_n._param_names
                guards.append((prefix, param_name))

            # Could test that optional_label_n is unique within this production,
            # but they're used so little, it's not really worth the bother?

            if rhs_body_n.kind == 'RHS_ITEMS':
                for rhs_item_n in rhs_body_n.children:
                    if rhs_item_n.kind == 'GNT':
                        _check_gnt_args(production_n, rhs_item_n, guards)

        elif rhs_n.kind == 'BACKTICKED_THING':
            # nothing to check?
            pass

        else:
            assert 0, rhs_n.kind


def _check_gnt_args(production_n, gnt_n, guards):
    """Check the args of one nonterminal reference on a production's RHS.

    Args:
        production_n: the production in whose RHS the reference appears.
        gnt_n: the 'GNT' node for the reference.
        guards: the (prefix, param_name) pairs of the guard on this RHS.
    """
    (nt_n, optional_params_n, _optional_opt_n) = gnt_n.children

    r_arg_names = []
    for r_arg in optional_params_n.children:
        (prefix, arg_name) = r_arg.groups
        if prefix not in ['+', '~', '?']:
            msg_at_node(r_arg,
                "ERROR: arg is missing +~?"
            )
        r_arg_names.append(arg_name)

    r_nt_name = gnt_n._nt_name
    if r_nt_name not in info_for_nt_:
        msg_at_node(nt_n,
            f"ERROR: reference to undefined nonterminal '{r_nt_name}'"
        )
        return

    # Compare the args against the params of the nonterminal's definition.
    d_production_n = info_for_nt_[r_nt_name].get_appropriate_def_occ(production_n._arena)
    d_param_names = d_production_n._param_names

    if len(r_arg_names) == len(d_param_names):
        if r_arg_names != d_param_names:
            msg_at_node(optional_params_n,
                f"ERROR: args are ordered {r_arg_names} but should be {d_param_names}"
            )
    else:
        msg_at_node(optional_params_n,
            f"ERROR: {r_nt_name} takes {d_param_names} but is invoked with {r_arg_names}"
        )

    # Look for valid-but-anomalous args...
    for r_arg in optional_params_n.children:
        (prefix, arg_name) = r_arg.groups
        if arg_name in production_n._param_names:
            # This arg refers to a parameter that appears on the prodn's LHS.
            # So normally, we'd expect a '?' prefix.
            if prefix == '?':
                # Good.
                pass
            elif (prefix, arg_name) in guards:
                # This is equivalent to '?'
                pass
            else:
                msg_at_node(r_arg,
                    f"WARNING: {production_n._lhs_symbol} has {arg_name} param, so you'd normally expect [?{arg_name}] in its rhss"
                )
        else:
            # This arg refers to parameter
            # that does not appear on the prodn's LHS.
            if prefix == '?':
                # because you can only use '?' on the RHS
                # when the parameter is 'declared' on the LHS
                msg_at_node(production_n,
                    f"ERROR: {arg_name} does not appear on the prodn's LHS, so cannot be referenced with '?'"
                )
# ------------------------------------------------------------------------------
def get_grammar_arena_for_section(section):
    """Return the grammar arena that `section` belongs to.

    'E' (examples) for the section titled 'Grammar Notation',
    'B' for a section whose number starts with 'B',
    and 'A' for any other section.
    """
    if section.section_title == 'Grammar Notation':
        return 'E' # Examples
    return 'B' if section.section_num.startswith('B') else 'A'
# XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
def check_non_defining_prodns(emu_grammars):
egm_log("check_non_defining_prodns...")
for emu_grammar in emu_grammars:
emu_grammar.puk_set = set()
# The production(s) in this emu_grammar are (in some sense)
# instances of productions defined elsewhere,
# and we'll be comparing the two to determine if these are correct.
# To distinguish, we'll use two different prefixes:
# 'd_' for the defining production, and
# 'u_' for the 'use' production.
# (You might expect 'r_' for 'referencing',
# but I already use 'r_' for 'right-hand side'.)
cc_section = emu_grammar.closest_containing_section()
u_arena = get_grammar_arena_for_section(cc_section)
gnode = emu_grammar._gnode
lhs_nt_for_this_emu_grammar = set()
for u_production_n in gnode._productions:
assert u_production_n.kind in ['ONELINE_PRODUCTION', 'MULTILINE_PRODUCTION']
(u_gnt_n, u_colons_n, _) = u_production_n.children
(u_nt_n, u_params_n, _) = u_gnt_n.children
# -----------------------
lhs_nt = u_production_n._lhs_symbol
if lhs_nt not in info_for_nt_:
msg_at_node(u_nt_n,
f"ERROR: lhs symbol {lhs_nt} in 'use' production does not match any defined nonterminal"
)
continue
nt_info = info_for_nt_[lhs_nt]
# Disable this because too many hits:
if False and lhs_nt in lhs_nt_for_this_emu_grammar:
msg_at_node(u_nt_n,
f"{lhs_nt} already appears as a lhs symbol in this <emu-grammar>"
)
lhs_nt_for_this_emu_grammar.add(lhs_nt)
# -----------------------
u_num_colons = u_production_n._num_colons
if u_num_colons != nt_info.num_colons:
msg_at_node(u_colons_n,
f"ERROR: #colons in use ({u_num_colons}) does not match #colons in defn ({nt_info.num_colons})"
)
# -----------------------
u_param_names = u_production_n._param_names
d_production_n = nt_info.get_appropriate_def_occ(u_arena)
if d_production_n._param_names:
# The 'def' production has parameters.
if u_param_names:
# The 'use' production also shows parameters.
u_lhs_args_are_suppressed = False
if u_param_names != d_production_n._param_names:
msg_at_node(u_params_n,
f"ERROR: params in use ({u_param_names}) does not match params in defn ({d_production_n._param_names})"
)
elif cc_section.attrs['id'] in [
'sec-rules-of-automatic-semicolon-insertion',
'sec-identifiers-static-semantics-early-errors',
'sec-primary-expression',
'sec-static-semantics-template-early-errors',
'sec-arrow-function-definitions',
]:
# This is an uncommon case (~20 occurrences),
# where the 'def' production has parameters
# and the 'use' production repeats them
# (because accompanying prose needs to refer to them).
pass
else:
msg_at_node(u_params_n,
f"INFO: params in a 'use' prodn is unusual: {u_param_names}"
)
else:
# This is a typical case (~958 occurrences),
# where a 'use' production elides the parameters
# specified in the 'def' production.
u_lhs_args_are_suppressed = True
else:
# The 'def' production doesn't have parameters.
# (~430 occurrences)
u_lhs_args_are_suppressed = None
if u_param_names:
msg_at_node(u_params_n,
f"ERROR: 'use' prodn has lhs-parameters but def prodn does not"
)
# In the use-prodn, we expect rhs-args iff there are lhs-params.
# u_expect_rhs_args = len(u_prodn_params) > 0
# --------------------------
# In 'use' productions, we don't usually have annotations
u_rhss = u_production_n._rhss
for u_rhs_n in u_rhss: