Parsing JSON with
a single regex
brian d foy
Houston Perl Mongers, October 17, 2013
Mastering Perl, 2e
• Read for free now
• http://chimera.labs.oreilly.com/books/1234000001527/index.html

• http://goo.gl/lmqAKX
• This stuff is in Chapter 2
Randal is wicked
• JSON is on a single line (minimized)
• ASCII only
• Fails very quickly
• Doesn't handle everything
• Uses many advanced regex features
• http://www.perlmonks.org/?node_id=995856
#!/usr/bin/env perl
use Data::Dumper qw(Dumper);
# $FROM_JSON -- a recursive-descent JSON grammar packed into one compiled regex.
#
# Matching a string against this pattern parses it and, through the code
# block that follows the top-level (?&VALUE), overwrites $_ with the decoded
# Perl data structure.  Intermediate results travel in $^R: every embedded
# code block returns a pair [ previous $^R, new value ], and the enclosing
# rule unwraps that pair to rebuild its own result.
#
# Limitations visible in the grammar:
#   * Whitespace is accepted only around a VALUE, so the input has to be
#     "minimized" JSON (no space between a key and its colon, for example).
#   * \uXXXX escapes are not supported; that alternative is commented out
#     because the eval-based decoder below would not translate them.
#   * true / false / null are mapped to 1 / 0 / undef.
#
# NOTE(review): STRING and NUMBER are decoded by a string eval of the matched
# text.  For STRING that text is a double-quoted Perl literal, so a JSON
# string containing "$" or "@" is interpolated by Perl.  Do not feed this
# parser untrusted input without replacing that eval with a real decoder.
my $FROM_JSON = qr{

# Entry point: parse one VALUE, then replace the topic variable with it.
(?&VALUE) (?{ $_ = $^R->[1] })

(?(DEFINE)

# OBJECT: start from an empty hash and fold in one key-value pair at a time.
# The delimiting braces are backslash-escaped so they match literally.
(?<OBJECT>
  (?{ [$^R, {}] })
  \{
    (?: (?&KV) # [[$^R, {}], $k, $v]
      (?{ # warn Dumper { obj1 => $^R };
          [$^R->[0][0], {$^R->[1] => $^R->[2]}] })
      (?: , (?&KV) # [[$^R, {...}], $k, $v]
        (?{ # warn Dumper { obj2 => $^R };
            [$^R->[0][0], {%{$^R->[0][1]}, $^R->[1] => $^R->[2]}] })
      )*
    )?
  \}
)

# KV: a string key, a colon, a value; flattened into one three-slot result.
(?<KV>
  (?&STRING) # [$^R, "string"]
  : (?&VALUE) # [[$^R, "string"], $value]
  (?{ # warn Dumper { kv => $^R };
      [$^R->[0][0], $^R->[0][1], $^R->[1]] })
)

# ARRAY: same folding scheme as OBJECT, with literal square brackets.
(?<ARRAY>
  (?{ [$^R, []] })
  \[
    (?: (?&VALUE) (?{ [$^R->[0][0], [$^R->[1]]] })
      (?: , (?&VALUE) (?{ # warn Dumper { atwo => $^R };
                          [$^R->[0][0], [@{$^R->[0][1]}, $^R->[1]]] })
      )*
    )?
  \]
)

# VALUE: any JSON value, with optional surrounding whitespace.
(?<VALUE>
  \s*
  (
      (?&STRING)
    |
      (?&NUMBER)
    |
      (?&OBJECT)
    |
      (?&ARRAY)
    |
      true (?{ [$^R, 1] })
    |
      false (?{ [$^R, 0] })
    |
      null (?{ [$^R, undef] })
  )
  \s*
)

# STRING: a double-quoted run of plain characters and simple backslash
# escapes.  The captured text, quotes included, is handed to eval.
(?<STRING>
  (
    "
    (?:
      [^\\"]+
    |
      \\ ["\\/bfnrt]
#   |
#     \\ u [0-9a-fA-F]{4}
    )*
    "
  )

  (?{ [$^R, eval $^N] })
)

# NUMBER: optional sign, integer part, optional fraction and exponent.
(?<NUMBER>
  (
    -?
    (?: 0 | [1-9]\d* )
    (?: \. \d+ )?
    (?: [eE] [-+]? \d+ )?
  )

  (?{ [$^R, eval $^N] })
)

) }xms;
# from_json( $text )
#
# Match $text against $FROM_JSON and return whatever $_ holds afterwards
# (the grammar's final code block stores the decoded structure there).
#
# Returns the literal string 'no match' when the text does not match.
# Re-throws any exception raised while the pattern (and the code embedded
# in it) was running.
sub from_json {
    local $_ = shift;    # private topic: the match target, later the result
    local $^R;           # start with an empty result accumulator

    # \A ... \z anchor the grammar to the whole input.  Without the
    # backslashes Perl would look for a literal "A" and interpolate an
    # unrelated variable named $FROM_JSONz.
    my $matched = eval { m{\A$FROM_JSON\z} };
    die $@ if $@;

    return $matched ? $_ : 'no match';
}
# Driver: every input line (from the files named on the command line, or
# from standard input) is handed to from_json and the result is dumped.
while (defined($_ = <>)) {
    chomp($_);
    print Dumper(from_json($_));
}
my $FROM_JSON = qr{
(?&VALUE) (?{ $_ = $^R->[1] })
(?(DEFINE)
(?<OBJECT>
(?{ [$^R, {}] })
{
(?: (?&KV) # [[$^R, {}], $k, $v]
(?{ # warn Dumper { obj1 => $^R };
[$^R->[0][0], {$^R->[1] => $^R->[2]}] })
(?: , (?&KV) # [[$^R, {...}], $k, $v]
(?{ # warn Dumper { obj2 => $^R };
[$^R->[0][0], {%{$^R->[0][1]}, $^R->[1] => $^R->[2]}] })
)*
)?
}
)
(?<KV>
(?&STRING) # [$^R, "string"]
: (?&VALUE) # [[$^R, "string"], $value]
(?{ # warn Dumper { kv => $^R };
[$^R->[0][0], $^R->[0][1], $^R->[1]] })
)
(?<ARRAY>
(?{ [$^R, []] })
[
(?: (?&VALUE) (?{ [$^R->[0][0], [$^R->[1]]] })
(?: , (?&VALUE) (?{ # warn Dumper { atwo => $^R };
[$^R->[0][0], [@{$^R->[0][1]}, $^R->[1]]] })
my $FROM_JSON = qr{
(?&VALUE) (?{ $_ = $^R->[1] })
(?(DEFINE)
(?<OBJECT>
(?{ [$^R, {}] })
{
(?: (?&KV) # [[$^R, {}], $k, $v]
(?{ # warn Dumper { obj1 => $^R };
[$^R->[0][0], {$^R->[1] => $^R->[2]}] })
(?: , (?&KV) # [[$^R, {...}], $k, $v]
(?{ # warn Dumper { obj2 => $^R };
[$^R->[0][0], {%{$^R->[0][1]}, $^R->[1] => $^R->[2]}] })
)*
)?
}
)
(?<KV>
(?&STRING) # [$^R, "string"]
: (?&VALUE) # [[$^R, "string"], $value]
(?{ # warn Dumper { kv => $^R };
[$^R->[0][0], $^R->[0][1], $^R->[1]] })
)
(?<ARRAY>
(?{ [$^R, []] })
[
(?: (?&VALUE) (?{ [$^R->[0][0], [$^R->[1]]] })
(?: , (?&VALUE) (?{ # warn Dumper { atwo => $^R };
[$^R->[0][0], [@{$^R->[0][1]}, $^R->[1]]] })
my $FROM_JSON = qr{
(?&VALUE) (?{ $_ = $^R->[1] })
(?(DEFINE)
(?<OBJECT>
(?{ [$^R, {}] })
{
(?: (?&KV) # [[$^R, {}], $k, $v]
(?{ # warn Dumper { obj1 => $^R };
[$^R->[0][0], {$^R->[1] => $^R->[2]}] })
(?: , (?&KV) # [[$^R, {...}], $k, $v]
(?{ # warn Dumper { obj2 => $^R };
[$^R->[0][0], {%{$^R->[0][1]}, $^R->[1] => $^R->[2]}] })
)*
)?
}
)
(?<KV>
(?&STRING) # [$^R, "string"]
: (?&VALUE) # [[$^R, "string"], $value]
(?{ # warn Dumper { kv => $^R };
[$^R->[0][0], $^R->[0][1], $^R->[1]] })
)
(?<ARRAY>
(?{ [$^R, []] })
[
(?: (?&VALUE) (?{ [$^R->[0][0], [$^R->[1]]] })
(?: , (?&VALUE) (?{ # warn Dumper { atwo => $^R };
[$^R->[0][0], [@{$^R->[0][1]}, $^R->[1]]] })
• Uses grammars: (?(DEFINE))
• Recurses: (?&KV), et alia
• Runs code during the regex: (?{ ... })
• Builds up a data structure: $^R
• At the end, replaces the string with a data
structure: (?{ $_ = $^R->[1] })
# First attempt at pulling out quoted text: a quote character of either
# kind, a non-greedy capture named "said", then a quote character of either
# kind.  The two character classes are independent of each other, so nothing
# in this pattern forces the closing quote to be the same as the opening one.
$_ =<<'HERE';
Amelia said "I am a camel"
HERE
say "Matched [$+{said}]!" if m/
( ['"] )
(?<said>.*?)
( ['"] )
/x;
# Second attempt: the closing quote is a backreference to capture group 1,
# so it has to be the very same character that opened the quotation.
# (Without the backslash the pattern would look for a literal digit "1".)
$_ =<<'HERE';
Amelia said 'I am a camel'
HERE
say "Matched [$+{said}]!" if m/
    ( ['"] )
    (?<said>.*?)
    ( \1 )
/x;
# Third attempt: (?1) re-runs the sub-pattern of capture group 1 at the
# current position.  It reuses the pattern ['"], not the text that group 1
# captured, so it is not equivalent to a backreference.
$_ =<<'HERE';
Amelia said 'I am a camel'
HERE
say "Matched [$+{said}]!" if m/
( ['"] )
(?<said>.*?)
(?1)
/x;
# Same pattern as the previous example, but the input opens with a single
# quote and closes with a double quote.  Because (?1) re-applies the
# character class ['"] rather than the captured character, the mismatched
# closing quote is accepted.
$_ =<<'HERE';
Amelia said 'I am a camel"
HERE
say "Matched [$+{said}]!" if m/
( ['"] )
(?<said>.*?)
(?1)
/x;
# Nested quotation: a double-quoted phrase inside a single-quoted one.
# The non-greedy capture stops as soon as (?1) can match, and (?1) accepts
# either quote character, so the match ends at the inner double quote
# instead of at the single quote that closes the outer quotation.
$_ =<<'HERE';
He said 'Amelia said "I am a camel"'
HERE
say "Matched [$+{said}]!" if m/
( ['"] )
(?<said>.*?)
(?1)
# Matches wrong quote!
/x;
# Balanced, nested quotations.  The outer group (capture group 1, also
# named "said") is: an opening quote remembered as "quote", then any mix of
# non-quote characters and nested quotations, then the SAME quote character
# again via the named backreference \g{quote}.  A nested quotation is
# matched by recursing into group 1 with (?1).
# (Without the backslash, "g{quote}" would only match those literal
# characters and the pattern could never succeed on this input.)
$_ =<<'HERE';
He said 'Amelia said "I am a camel"'
HERE
say "Matched [$+{said}]!" if m/
    (?<said>
        (?<quote>['"])
        (?:
            [^'"]++
        |
            (?<said> (?1) )
        )*
        \g{quote}
    )
/x;

# (?1) above refers to the group that would be $1: the outer (?<said> ...).
# Three levels of nesting, matched by recursing into the WHOLE pattern with
# (?R) instead of into a numbered group.  Because (?R) re-enters the entire
# pattern, the trailing code block is part of every recursion and reports
# each level as it completes.  \g{quote} is a named backreference that
# requires the closing quote to equal the opening one at that level.
$_ =<<'HERE';
Out "Top 'Middle "Bottom" Middle' Out"
HERE
say "Matched [$+{said}]!" if m/
    (?<said>
        (?<quote>['"])
        (?:
            [^'"]++
        |
            (?R)
        )*
        \g{quote}
    )
    (?{ say "Inside regex: $+{said}" })
/x;
# The same nested-quote matcher with its two character classes factored out
# into named sub-patterns.  Groups inside (?(DEFINE) ...) never match where
# they are written; they only exist to be called with (?&NAME).
# Note that QUOTE (the sub-pattern) and quote (the capture that remembers
# which quote character opened this level) are different names.
$_ =<<'HERE';
Out "Top 'Mid "Bottom" Mid' Out"
HERE
say "Matched [$+{said}]!" if m/
    (?(DEFINE)
        (?<QUOTE> ['"])
        (?<NOT_QUOTE> [^'"])
    )
    (?<said>
        (?<quote>(?&QUOTE))
        (?:
            (?&NOT_QUOTE)++
        |
            (?R)
        )*
        \g{quote}
    )
    (?{ say "Inside regex: $+{said}" })
/x;
# Collect every quotation instead of printing it: the trailing code block
# pushes $^N (the text of the most recently closed group, here the unnamed
# group around the whole quotation) onto @matches.  Since (?R) re-enters the
# entire pattern, that code block is part of every recursion level.
# The match runs against whatever $_ currently holds; nothing is assigned
# to it here.
my @matches;
say "Matched!" if m/
    (?(DEFINE)
        (?<QUOTE_MARK> ['"])
        (?<NOT_QUOTE_MARK> [^'"])
    )
    (
        (?<quote>(?&QUOTE_MARK))
        (?:
            (?&NOT_QUOTE_MARK)++
        |
            (?R)
        )*
        \g{quote}
    )
    (?{ push @matches, $^N })
/x;
# Move the whole quotation rule, code block included, into the DEFINE
# section as QUOTE.  The pattern proper is then just a call to (?&QUOTE),
# and nested quotations recurse into that named rule rather than into the
# whole pattern.
# This statement neither assigns $_ nor declares @matches; it uses whatever
# is already in scope.
say "Matched!" if m/
    (?(DEFINE)
        (?<QUOTE_MARK> ['"])
        (?<NOT_QUOTE_MARK> [^'"])
        (?<QUOTE>
            (
                (?<quote>(?&QUOTE_MARK))
                (?:
                    (?&NOT_QUOTE_MARK)++
                |
                    (?&QUOTE)
                )*
                \g{quote}
            )
            (?{ push @matches, $^N })
        )
    )
    (?&QUOTE)
/x;
# Final refinement: do not touch @matches while the match is in progress.
# Each completed QUOTE returns a new array ref made of the previous $^R
# contents plus $^N, so the list of quotations travels in $^R.  Only the
# code block after the top-level (?&QUOTE) copies the accumulated list into
# @matches.
# This statement neither assigns $_ nor declares @matches; it uses whatever
# is already in scope.
# NOTE(review): this assumes $^R is undef or an array reference when the
# match starts -- consider "local $^R = [];" before matching; TODO confirm.
say "Matched!" if m/
    (?(DEFINE)
        (?<QUOTE_MARK> ['"])
        (?<NOT_QUOTE_MARK> [^'"])
        (?<QUOTE>
            (
                (?<quote>(?&QUOTE_MARK))
                (?:
                    (?&NOT_QUOTE_MARK)++
                |
                    (?&QUOTE)
                )*
                \g{quote}
            )
            (?{ [ @{$^R}, $^N ] })
        )
    )
    (?&QUOTE) (?{ @matches = @{ $^R } })
/x;

Parsing JSON with a single regex

  • 1.
    Parsing JSON with a single regex brian d foy Houston Perl Mongers, October 17, 2013
  • 2.
    Mastering Perl, 2e • Read for free now • http://chimera.labs.oreilly.com/books/1234000001527/index.html • http://goo.gl/lmqAKX • This stuff is in Chapter 2
  • 3.
    Randal is wicked • JSON is on a single line (minimized) • ASCII only • Fails very quickly • Doesn't handle everything • Uses many advanced regex features • http://www.perlmonks.org/?node_id=995856
  • 4.
    #!/usr/bin/env perl use Data::Dumperqw(Dumper); my $FROM_JSON = qr{ (?&VALUE) (?{ $_ = $^R->[1] }) (?(DEFINE) (?<OBJECT> (?{ [$^R, {}] }) { (?: (?&KV) # [[$^R, {}], $k, $v] (?{ # warn Dumper { obj1 => $^R }; [$^R->[0][0], {$^R->[1] => $^R->[2]}] }) (?: , (?&KV) # [[$^R, {...}], $k, $v] (?{ # warn Dumper { obj2 => $^R }; [$^R->[0][0], {%{$^R->[0][1]}, $^R->[1] => $^R->[2]}] }) )* )? } ) (?<KV> (?&STRING) # [$^R, "string"] : (?&VALUE) # [[$^R, "string"], $value] (?{ # warn Dumper { kv => $^R }; [$^R->[0][0], $^R->[0][1], $^R->[1]] }) )
  • 5.
    (?<ARRAY> (?{ [$^R, []]}) [ (?: (?&VALUE) (?{ [$^R->[0][0], [$^R->[1]]] }) (?: , (?&VALUE) (?{ # warn Dumper { atwo => $^R }; [$^R->[0][0], [@{$^R->[0][1]}, $^R->[1]]] }) )* )? ] ) (?<VALUE> s* ( (?&STRING) | (?&NUMBER) | (?&OBJECT) | (?&ARRAY) | true (?{ [$^R, 1] }) | false (?{ [$^R, 0] }) | null (?{ [$^R, undef] }) ) s* )
  • 6.
    (?<STRING> ( " (?: [^"]+ | ["/bfnrt] # | # u[0-9a-fA-f]{4} )* " ) (?{ [$^R, eval $^N] }) ) (?<NUMBER> ( -? (?: 0 | [1-9]d* ) (?: . d+ )? (?: [eE] [-+]? d+ )? ) (?{ [$^R, eval $^N] }) ) ) }xms;
  • 7.
    sub from_json { local$_ = shift; local $^R; eval { m{A$FROM_JSONz}; } and return $_; die $@ if $@; return 'no match'; } while (<>) { chomp; print Dumper from_json($_); }
  • 8.
    my $FROM_JSON =qr{ (?&VALUE) (?{ $_ = $^R->[1] }) (?(DEFINE) (?<OBJECT> (?{ [$^R, {}] }) { (?: (?&KV) # [[$^R, {}], $k, $v] (?{ # warn Dumper { obj1 => $^R }; [$^R->[0][0], {$^R->[1] => $^R->[2]}] }) (?: , (?&KV) # [[$^R, {...}], $k, $v] (?{ # warn Dumper { obj2 => $^R }; [$^R->[0][0], {%{$^R->[0][1]}, $^R->[1] => $^R->[2]}] }) )* )? } ) (?<KV> (?&STRING) # [$^R, "string"] : (?&VALUE) # [[$^R, "string"], $value] (?{ # warn Dumper { kv => $^R }; [$^R->[0][0], $^R->[0][1], $^R->[1]] }) ) (?<ARRAY> (?{ [$^R, []] }) [ (?: (?&VALUE) (?{ [$^R->[0][0], [$^R->[1]]] }) (?: , (?&VALUE) (?{ # warn Dumper { atwo => $^R }; [$^R->[0][0], [@{$^R->[0][1]}, $^R->[1]]] })
  • 9.
    my $FROM_JSON =qr{ (?&VALUE) (?{ $_ = $^R->[1] }) (?(DEFINE) (?<OBJECT> (?{ [$^R, {}] }) { (?: (?&KV) # [[$^R, {}], $k, $v] (?{ # warn Dumper { obj1 => $^R }; [$^R->[0][0], {$^R->[1] => $^R->[2]}] }) (?: , (?&KV) # [[$^R, {...}], $k, $v] (?{ # warn Dumper { obj2 => $^R }; [$^R->[0][0], {%{$^R->[0][1]}, $^R->[1] => $^R->[2]}] }) )* )? } ) (?<KV> (?&STRING) # [$^R, "string"] : (?&VALUE) # [[$^R, "string"], $value] (?{ # warn Dumper { kv => $^R }; [$^R->[0][0], $^R->[0][1], $^R->[1]] }) ) (?<ARRAY> (?{ [$^R, []] }) [ (?: (?&VALUE) (?{ [$^R->[0][0], [$^R->[1]]] }) (?: , (?&VALUE) (?{ # warn Dumper { atwo => $^R }; [$^R->[0][0], [@{$^R->[0][1]}, $^R->[1]]] })
  • 10.
    my $FROM_JSON =qr{ (?&VALUE) (?{ $_ = $^R->[1] }) (?(DEFINE) (?<OBJECT> (?{ [$^R, {}] }) { (?: (?&KV) # [[$^R, {}], $k, $v] (?{ # warn Dumper { obj1 => $^R }; [$^R->[0][0], {$^R->[1] => $^R->[2]}] }) (?: , (?&KV) # [[$^R, {...}], $k, $v] (?{ # warn Dumper { obj2 => $^R }; [$^R->[0][0], {%{$^R->[0][1]}, $^R->[1] => $^R->[2]}] }) )* )? } ) (?<KV> (?&STRING) # [$^R, "string"] : (?&VALUE) # [[$^R, "string"], $value] (?{ # warn Dumper { kv => $^R }; [$^R->[0][0], $^R->[0][1], $^R->[1]] }) ) (?<ARRAY> (?{ [$^R, []] }) [ (?: (?&VALUE) (?{ [$^R->[0][0], [$^R->[1]]] }) (?: , (?&VALUE) (?{ # warn Dumper { atwo => $^R }; [$^R->[0][0], [@{$^R->[0][1]}, $^R->[1]]] })
  • 11.
    • Uses grammars: (?(DEFINE)) • Recurses: (?&KV), et alia • Runs code during the regex: (?{ ... }) • Builds up a data structure: $^R • At the end, replaces the string with a data structure: (?{ $_ = $^R->[1] })
  • 12.
    $_ =<<'HERE'; Amelia said"I am a camel" HERE say "Matched [$+{said}]!" if m/ ( ['"] ) (?<said>.*?) ( ['"] ) /x;
  • 13.
    $_ =<<'HERE'; Amelia said'I am a camel' HERE say "Matched [$+{said}]!" if m/ ( ['"] ) (?<said>.*?) ( 1 ) /x;
  • 14.
    $_ =<<'HERE'; Amelia said'I am a camel' HERE say "Matched [$+{said}]!" if m/ ( ['"] ) (?<said>.*?) (?1) /x;
  • 15.
    $_ =<<'HERE'; Amelia said'I am a camel" HERE say "Matched [$+{said}]!" if m/ ( ['"] ) (?<said>.*?) (?1) /x;
  • 16.
    $_ =<<'HERE'; He said'Amelia said "I am a camel"' HERE say "Matched [$+{said}]!" if m/ ( ['"] ) (?<said>.*?) (?1) # Matches wrong quote! /x;
  • 17.
    $_ =<<'HERE'; He said'Amelia said "I am a camel"' HERE say "Matched [$+{said}]!" if m/ (?<said> (?<quote>['"]) (?: [^'"]++ | (?<said> (?1) ) )* g{quote} ) /x; # $1
  • 18.
    $_ =<<'HERE'; Out "Top'Middle "Bottom" Middle' Out" HERE say "Matched [$+{said}]!" if m/ (?<said> (?<quote>['"]) (?: [^'"]++ | (?R) )* g{quote} ) (?{ say "Inside regex: $+{said}" }) /x;
  • 19.
    $_ =<<'HERE'; Out "Top'Mid "Bottom" Mid' Out" HERE say "Matched [$+{said}]!" if m/ (?(DEFINE) (?<QUOTE> ['"]) (?<NOT_QUOTE> [^'"]) ) (?<said> (?<quote>(?&QUOTE)) (?: (?&NOT_QUOTE)++ | (?R) )* g{quote} ) (?{ say "Inside regex: $+{said}" }) /x;
  • 20.
    my @matches; say "Matched!"if m/ (?(DEFINE) (?<QUOTE_MARK> ['"]) (?<NOT_QUOTE_MARK> [^'"]) ) ( (?<quote>(?&QUOTE_MARK)) (?: (?&NOT_QUOTE_MARK)++ | (?R) )* g{quote} ) (?{ push @matches, $^N }) /x;
  • 21.
    say "Matched!" ifm/ (?(DEFINE) (?<QUOTE_MARK> ['"]) (?<NOT_QUOTE_MARK> [^'"]) (?<QUOTE> ( (?<quote>(?&QUOTE_MARK)) (?: (?&NOT_QUOTE_MARK)++ | (?&QUOTE) )* g{quote} ) (?{ push @matches, $^N }) ) ) (?&QUOTE) /x;
  • 22.
    say "Matched!" ifm/ (?(DEFINE) (?<QUOTE_MARK> ['"]) (?<NOT_QUOTE_MARK> [^'"]) (?<QUOTE> ( (?<quote>(?&QUOTE_MARK)) (?: (?&NOT_QUOTE_MARK)++ | (?&QUOTE) )* g{quote} ) (?{ [ @{$^R}, $^N ] }) ) ) (?&QUOTE) (?{ @matches = @{ $^R } }) /x;