Achieving Parsing
 Sanity in Erlang
       with Neotoma



         Sean Cribbs

       Web Consultant
    Ruby and Erlang Hacker
Quick Review
context-free
 grammars
Chomsky et al,
 natural langs
lots of massaging
inherent ambiguity
if A then if B then C else D

if A then if B then C else D

if A then if B then C else D
focused on
generating
parsing expression
    grammars
top-down parsing
 language (70’s)
direct
representation of
parsing functions
Brian Ford 2002
focused on
recognizing
computer
languages
parsing
expressions
e1 e2
e1 / e2
e+
e*
&e
!e
e?
“string”
 [A-Z]
    .
PEG > regexps
combined
lex+parse
choice is ordered
no ambiguity
dangling else
  obviated
greedy repetition
unlimited
  lookahead
with predicates
no left-recursion!
       (use *,+)
Parsing Techniques
Tabular
test every rule
Recursive-descent
 call & consume
Predictive
yacc/yecc
Packrat
r.d. with memo
sacrifice memory
    for speed
  linear with input length ~ 400x
supports PEGs and
   some CFGs
Treetop
 Pappy
Neotoma
Neotoma
Behind the Code™
can:has(cukes) ->
      false.
Cucumber uses
   Treetop
PEG → leex/yecc
     FAIL
Definitions.

D = [0-9]
IDENT = [a-z|A-Z|0-9|_|-]

Rules.

_         : {token, {underscore, TokenLine, TokenChars}}.
-         : {token, {dash, TokenLine, TokenChars}}.
%         : {token, {tag_start, TokenLine, TokenChars}}.
\.        : {token, {class_start, TokenLine, TokenChars}}.
#         : {token, {id_start, TokenLine, TokenChars}}.
{D}+         : {token, {number, TokenLine, list_to_integer(TokenChars)}}.
'(\\\^.|\\.|[^'])*' :
  S = lists:sublist(TokenChars, 2, TokenLen - 2),
  {token, {string, TokenLine, S}}.
{IDENT}+ : {token, {chr, TokenLine, TokenChars}}.
\{        : {token, {lcurly, TokenLine, TokenChars}}.
\}        : {token, {rcurly, TokenLine, TokenChars}}.
\[        : {token, {lbrace, TokenLine, TokenChars}}.
\]        : {token, {rbrace, TokenLine, TokenChars}}.
@          : {token, {at, TokenLine, TokenChars}}.
,        : {token, {comma, TokenLine, TokenChars}}.
'        : {token, {quote, TokenLine, TokenChars}}.
:        : {token, {colon, TokenLine, TokenChars}}.
/         : {token, {slash, TokenLine, TokenChars}}.
!        : {token, {bang, TokenLine, TokenChars}}.
\(        : {token, {lparen, TokenLine, TokenChars}}.
\)        : {token, {rparen, TokenLine, TokenChars}}.
\|        : {token, {pipe, TokenLine, TokenChars}}.
<          : {token, {lt, TokenLine, TokenChars}}.
>          : {token, {gt, TokenLine, TokenChars}}.
\s+         : {token, {space, TokenLine, TokenChars}}.

Erlang code.
Rootsymbol template_stmt.

template_stmt      ->   doctype : '$1'.
template_stmt      ->   var_ref : '$1'.
template_stmt      ->   iter : '$1'.
template_stmt      ->   fun_call : '$1'.
template_stmt      ->   tag_decl : '$1'.

%% doctype selector
doctype -> bang bang         bang   : {doctype, "Transitional", []}.
doctype -> bang bang         bang   space : {doctype, "Transitional", []}.
doctype -> bang bang         bang   space doctype_name : {doctype, '$5', []}.
doctype -> bang bang         bang   space doctype_name space doctype_name : {doctype, '$5', '$7'}.

doctype_name -> doctype_name_elem doctype_name : '$1' ++ '$2'.
doctype_name -> doctype_name_elem : '$1'.

doctype_name_elem         ->   chr : unwrap('$1').
doctype_name_elem         ->   dash : "-".
doctype_name_elem         ->   class_start : ".".
doctype_name_elem         ->   number : number_to_list('$1').

%% Variable reference for emitting, iterating, and passing to funcalls
var_ref -> at name : {var_ref, unwrap('$2')}.
var_ref -> at name lbrace number rbrace : {var_ref, unwrap('$2'), unwrap('$4')}.

%% Iterator
iter -> dash space list_open iter_item list_close space lt dash space var_ref : {iter, '$4', '$10'}.

iter_list -> iter_item : ['$1'].
iter_list -> iter_item list_sep iter_list : ['$1'|'$3'].

iter_item   ->   underscore : ignore.
iter_item   ->   var_ref : '$1'.
iter_item   ->   tuple_open iter_list tuple_close: {tuple, '$2'}.
iter_item   ->   list_open iter_list list_close: {list, '$2'}.

%% Function calls
fun_call -> at name colon name params_open params_close : {fun_call, name_to_atom('$2'), name_to_atom('$4'), []}.
fun_call -> at name colon name params_open param_list params_close : {fun_call, name_to_atom('$2'), name_to_atom('$4'), '$6'}.
fun_call -> at name colon name : {fun_call, name_to_atom('$2'), name_to_atom('$4'), []}.

fun_call -> at at name colon name params_open params_close : {fun_call_env, name_to_atom('$3'), name_to_atom('$5'), []}.
fun_call -> at at name colon name params_open param_list params_close : {fun_call_env, name_to_atom('$3'), name_to_atom('$5'), '$7'}.
fun_call -> at at name colon name : {fun_call_env, name_to_atom('$3'), name_to_atom('$5'), []}.

param_list -> param : ['$1'].
parsec → eParSec
Higher Order
 Functions
functions as data
currying +
composition
HOF protocol

% A parser function
fun(Input, Index) ->
     {fail, Reason} |
     {AST, Remaining, NewIndex}.
% Implements "?" PEG operator
p_optional(P) ->
 fun(Input, Index) ->
   case P(Input, Index) of
    {fail, _} -> {[], Input, Index};
    {_,_,_} = Success -> Success
      % {Parsed, RemainingInput, NewIndex}
   end
 end.
% PEG
optional_space <- space?;


% Erlang
optional_space(Input,Index) ->
 (p_optional(fun space/2))(Input, Index).
Yay! RD!
make it memo
ets
Erlang Term
   Storage
{key, value}
key = Index
value = dict
 dict is an opaque hashtable
% Memoization wrapper
p(Inp, StartIndex, Name, ParseFun, TransformFun) ->
 % Grab the memo table from ets
 Memo = get_memo(StartIndex),
 % See if the current reduction is memoized
 case dict:find(Name, Memo) of
   % If it is, return the result
   {ok, Result} -> Result;
   % If not, attempt to parse
   _ ->
    case ParseFun(Inp, StartIndex) of
      % If it fails, memoize the failure
      {fail,_} = Failure ->
        memoize(StartIndex, dict:store(Name, Failure, Memo)),
        Failure;
      % If it passes, transform and memoize the result.
      {Result, InpRem, NewIndex} ->
        Transformed = TransformFun(Result, StartIndex),
        memoize(StartIndex, dict:store(Name, {Transformed, InpRem, NewIndex}, Memo)),
        {Transformed, InpRem, NewIndex}
    end
 end.
self-hosting
rules <- space? declaration_sequence space?;
declaration_sequence <- head:declaration tail:(space declaration)*;
declaration <- nonterminal space '<-' space parsing_expression space? ';';
parsing_expression <- choice / sequence / primary;
choice <- head:alternative tail:(space '/' space alternative)+;
alternative <- sequence / primary;
primary <- prefix atomic / atomic suffix / atomic;
sequence <- head:labeled_sequence_primary tail:(space labeled_sequence_primary)+;
labeled_sequence_primary <- label? primary;
label <- alpha_char alphanumeric_char* ':';
suffix <- repetition_suffix / optional_suffix;
optional_suffix <- '?';
repetition_suffix <- '+' / '*';
prefix <- '&' / '!';
atomic <- terminal / nonterminal / parenthesized_expression;
parenthesized_expression <- '(' space? parsing_expression space? ')';
nonterminal <- alpha_char alphanumeric_char*;
terminal <- quoted_string / character_class / anything_symbol;
quoted_string <- single_quoted_string / double_quoted_string;
double_quoted_string <- '"' string:(!'"' ("\\\\" / '\\"' / .))* '"';
single_quoted_string <- "'" string:(!"'" ("\\\\" / "\\'" / .))* "'";
character_class <- '[' characters:(!']' ('\\\\' . / !'\\\\' .))+ ']';
anything_symbol <- '.';
alpha_char <- [a-z_];
alphanumeric_char <- alpha_char / [0-9];
space <- (white / comment_to_eol)+;
comment_to_eol <- '%' (!"\n" .)*;
white <- [ \t\n\r];
parse_transform
   f(AST) -> NewAST.
standalone,
code-generation
Future directions
inline code in PEG
atomic <- terminal / nonterminal / parenthesized_expression
"""
% Params: Node, Idx
case Node of
    {'nonterminal', Symbol} ->
       add_nt(Symbol, Idx),
       "fun '" ++ Symbol ++ "'/2";
    Any -> Any
end
"""
'atomic'(Input, Index) ->
  p(Input, Index, 'atomic',
 fun(I,D) ->
  (p_choose([
      fun 'terminal'/2,
      fun 'nonterminal'/2,
      fun 'parenthesized_expression'/2]))(I,D) end,
 fun(Node, Idx) ->
  case Node of
    {'nonterminal', Symbol} ->
       add_nt(Symbol, Idx),
       "fun '" ++ Symbol ++ "'/2";
    Any -> Any
  end
 end).
process dictionary
      BAD
Reia
retem
sedate
Demo
http://github.com/seancribbs/neotoma
questions?

Achieving Parsing Sanity In Erlang

  • 1.
    Achieving Parsing Sanity in Erlang with Neotoma Sean Cribbs Web Consultant Ruby and Erlang Hacker
  • 2.
  • 3.
  • 4.
    Chomsky et al, natural langs
  • 5.
  • 6.
  • 7.
    if A then if B then C else D if A then if B then C else D if A then if B then C else D
  • 8.
  • 9.
  • 10.
  • 11.
  • 12.
  • 13.
  • 14.
  • 15.
  • 16.
  • 17.
  • 18.
  • 19.
  • 20.
  • 21.
  • 22.
  • 23.
  • 24.
  • 25.
  • 26.
  • 27.
  • 28.
  • 29.
  • 30.
  • 31.
  • 32.
  • 33.
  • 34.
  • 35.
  • 36.
  • 37.
    sacrifice memory for speed linear with input length ~ 400x
  • 38.
  • 39.
  • 40.
  • 41.
  • 42.
  • 43.
  • 44.
    Definitions. D = [0-9] IDENT = [a-z|A-Z|0-9|_|-] Rules. _ : {token, {underscore, TokenLine, TokenChars}}. - : {token, {dash, TokenLine, TokenChars}}. % : {token, {tag_start, TokenLine, TokenChars}}. \. : {token, {class_start, TokenLine, TokenChars}}. # : {token, {id_start, TokenLine, TokenChars}}. {D}+ : {token, {number, TokenLine, list_to_integer(TokenChars)}}. '(\\\^.|\\.|[^'])*' : S = lists:sublist(TokenChars, 2, TokenLen - 2), {token, {string, TokenLine, S}}. {IDENT}+ : {token, {chr, TokenLine, TokenChars}}. \{ : {token, {lcurly, TokenLine, TokenChars}}. \} : {token, {rcurly, TokenLine, TokenChars}}. \[ : {token, {lbrace, TokenLine, TokenChars}}. \] : {token, {rbrace, TokenLine, TokenChars}}. @ : {token, {at, TokenLine, TokenChars}}. , : {token, {comma, TokenLine, TokenChars}}. ' : {token, {quote, TokenLine, TokenChars}}. : : {token, {colon, TokenLine, TokenChars}}. / : {token, {slash, TokenLine, TokenChars}}. ! : {token, {bang, TokenLine, TokenChars}}. \( : {token, {lparen, TokenLine, TokenChars}}. \) : {token, {rparen, TokenLine, TokenChars}}. \| : {token, {pipe, TokenLine, TokenChars}}. < : {token, {lt, TokenLine, TokenChars}}. > : {token, {gt, TokenLine, TokenChars}}. \s+ : {token, {space, TokenLine, TokenChars}}. Erlang code.
  • 45.
    Rootsymbol template_stmt. template_stmt -> doctype : '$1'. template_stmt -> var_ref : '$1'. template_stmt -> iter : '$1'. template_stmt -> fun_call : '$1'. template_stmt -> tag_decl : '$1'. %% doctype selector doctype -> bang bang bang : {doctype, "Transitional", []}. doctype -> bang bang bang space : {doctype, "Transitional", []}. doctype -> bang bang bang space doctype_name : {doctype, '$5', []}. doctype -> bang bang bang space doctype_name space doctype_name : {doctype, '$5', '$7'}. doctype_name -> doctype_name_elem doctype_name : '$1' ++ '$2'. doctype_name -> doctype_name_elem : '$1'. doctype_name_elem -> chr : unwrap('$1'). doctype_name_elem -> dash : "-". doctype_name_elem -> class_start : ".". doctype_name_elem -> number : number_to_list('$1'). %% Variable reference for emitting, iterating, and passing to funcalls var_ref -> at name : {var_ref, unwrap('$2')}. var_ref -> at name lbrace number rbrace : {var_ref, unwrap('$2'), unwrap('$4')}. %% Iterator iter -> dash space list_open iter_item list_close space lt dash space var_ref : {iter, '$4', '$10'}. iter_list -> iter_item : ['$1']. iter_list -> iter_item list_sep iter_list : ['$1'|'$3']. iter_item -> underscore : ignore. iter_item -> var_ref : '$1'. iter_item -> tuple_open iter_list tuple_close: {tuple, '$2'}. iter_item -> list_open iter_list list_close: {list, '$2'}. %% Function calls fun_call -> at name colon name params_open params_close : {fun_call, name_to_atom('$2'), name_to_atom('$4'), []}. fun_call -> at name colon name params_open param_list params_close : {fun_call, name_to_atom('$2'), name_to_atom('$4'), '$6'}. fun_call -> at name colon name : {fun_call, name_to_atom('$2'), name_to_atom('$4'), []}. fun_call -> at at name colon name params_open params_close : {fun_call_env, name_to_atom('$3'), name_to_atom('$5'), []}. fun_call -> at at name colon name params_open param_list params_close : {fun_call_env, name_to_atom('$3'), name_to_atom('$5'), '$7'}. 
fun_call -> at at name colon name : {fun_call_env, name_to_atom('$3'), name_to_atom('$5'), []}. param_list -> param : ['$1'].
  • 46.
  • 47.
  • 48.
  • 49.
  • 50.
    HOF protocol % A parser function fun(Input, Index) -> {fail, Reason} | {AST, Remaining, NewIndex}.
  • 51.
    % Implements "?" PEG operator p_optional(P) -> fun(Input, Index) -> case P(Input, Index) of {fail, _} -> {[], Input, Index}; {_,_,_} = Success -> Success % {Parsed, RemainingInput, NewIndex} end end.
  • 52.
    % PEG optional_space <- space?; % Erlang optional_space(Input,Index) -> (p_optional(fun space/2))(Input, Index).
  • 53.
  • 54.
  • 55.
  • 56.
  • 57.
    value = dict dict is an opaque hashtable
  • 58.
    % Memoization wrapper p(Inp, StartIndex, Name, ParseFun, TransformFun) -> % Grab the memo table from ets Memo = get_memo(StartIndex), % See if the current reduction is memoized case dict:find(Name, Memo) of % If it is, return the result {ok, Result} -> Result; % If not, attempt to parse _ -> case ParseFun(Inp, StartIndex) of % If it fails, memoize the failure {fail,_} = Failure -> memoize(StartIndex, dict:store(Name, Failure, Memo)), Failure; % If it passes, transform and memoize the result. {Result, InpRem, NewIndex} -> Transformed = TransformFun(Result, StartIndex), memoize(StartIndex, dict:store(Name, {Transformed, InpRem, NewIndex}, Memo)), {Transformed, InpRem, NewIndex} end end.
  • 59.
  • 60.
    rules <- space? declaration_sequence space?; declaration_sequence <- head:declaration tail:(space declaration)*; declaration <- nonterminal space '<-' space parsing_expression space? ';'; parsing_expression <- choice / sequence / primary; choice <- head:alternative tail:(space '/' space alternative)+; alternative <- sequence / primary; primary <- prefix atomic / atomic suffix / atomic; sequence <- head:labeled_sequence_primary tail:(space labeled_sequence_primary)+; labeled_sequence_primary <- label? primary; label <- alpha_char alphanumeric_char* ':'; suffix <- repetition_suffix / optional_suffix; optional_suffix <- '?'; repetition_suffix <- '+' / '*'; prefix <- '&' / '!'; atomic <- terminal / nonterminal / parenthesized_expression; parenthesized_expression <- '(' space? parsing_expression space? ')'; nonterminal <- alpha_char alphanumeric_char*; terminal <- quoted_string / character_class / anything_symbol; quoted_string <- single_quoted_string / double_quoted_string; double_quoted_string <- '"' string:(!'"' ("\\\\" / '\\"' / .))* '"'; single_quoted_string <- "'" string:(!"'" ("\\\\" / "\\'" / .))* "'"; character_class <- '[' characters:(!']' ('\\\\' . / !'\\\\' .))+ ']'; anything_symbol <- '.'; alpha_char <- [a-z_]; alphanumeric_char <- alpha_char / [0-9]; space <- (white / comment_to_eol)+; comment_to_eol <- '%' (!"\n" .)*; white <- [ \t\n\r];
  • 61.
    parse_transform f(AST) -> NewAST.
  • 62.
  • 63.
  • 64.
  • 65.
    atomic <- terminal / nonterminal / parenthesized_expression """ % Params: Node, Idx case Node of {'nonterminal', Symbol} -> add_nt(Symbol, Idx), "fun '" ++ Symbol ++ "'/2"; Any -> Any end """
  • 66.
    'atomic'(Input, Index) -> p(Input, Index, 'atomic', fun(I,D) -> (p_choose([ fun 'terminal'/2, fun 'nonterminal'/2, fun 'parenthesized_expression'/2]))(I,D) end, fun(Node, Idx) -> case Node of {'nonterminal', Symbol} -> add_nt(Symbol, Idx), "fun '" ++ Symbol ++ "'/2"; Any -> Any end end).
  • 67.
  • 68.
  • 69.
  • 70.