Successfully reported this slideshow.
Upcoming SlideShare
×

# Parse Everything With Elixir

2,946 views

Published on

Paco: Parser combinator library for Elixir. The presentation contains many examples of increasing complexity.

Published in: Technology
• Full Name
Comment goes here.

Are you sure you want to Yes No
• Be the first to comment

### Parse Everything With Elixir

1. 1. elixir
2. 2. ! " #
3. 3.
4. 4. Interactive Elixir (1.1.0) iex(1)> import Paco nil iex(2)> import Paco.Parser nil iex(3)> parse("a", lit("a")) {:ok, "a"} iex(4)> parse("a", lit("a"), format: :raw) %Paco.Success{ from: {0, 1, 1}, to: {0, 1, 1}, at: {1, 1, 2}, result: "a", tail: "", ...}
5. 5. iex(5)> parse("aaa", lit("aaa")) {:ok, "aaa"} iex(6)> "aaa" |> parse(lit("aaa")) {:ok, "aaa"} iex(7)> "aaa" |> parse(lit("a")) {:ok, "a"} iex(8)> "aaa" |> parse(lit("a"), format: :raw) %Paco.Success{ from: {0, 1, 1}, to: {0, 1, 1}, at: {1, 1, 2}, result: "a", tail: "aa", ...}
6. 6. iex(9)> "b" |> parse(lit("a")) {:error, "expected "a" at 1:1 but got "b""} iex(10)> "b" |> parse(lit("a"), format: :raw) %Paco.Failure{ at: {0, 1, 1}, expected: "a", tail: "b", ...}
7. 7. iex(1)> "aaa" |> parse(any) {:ok, "a"} iex(2)> "aaa" |> parse(any(1)) {:ok, "a"} iex(3)> "aaa" |> parse(any(2)) {:ok, "aa"} iex(4)> "a" |> parse(any(2)) {:error, "expected exactly 2 characters at 1:1 but got "a""} iex(5)> "aaa" |> parse(any(at_least: 2)) {:ok, "aaa"} iex(6)> "aaa" |> parse(any(at_most: 2)) {:ok, "aa"}
8. 8. iex(1)> "bbabcd" |> parse(while("abc")) {:ok, "bbabc"} iex(2)> "xxx" |> parse(while("abc")) {:ok, ""} iex(3)> "xxx" |> parse(while("abc", at_least: 2)) {:error, "expected at least 2 characters in alphabet "abc" at 1:1 but got "xx""} iex(4)> import Paco.ASCII, only: [lowercase?: 1] iex(5)> "abCD" |> parse(while(&lowercase?/1)) {:ok, "ab"} iex(6)> "abCD" |> parse(while(&lowercase?/1, at_least: 3)) {:error, "expected at least 3 lowercase characters at 1:1 but got "abC""}
9. 9. iex(1)> "abc" |> parse(until("c")) {:ok, "ab"} iex(2)> "ab\\cdc" |> parse(until("c", escaped_with: "\\")) {:ok, "abcd"} iex(3)> "ab\\cdc" |> parse(until("c", escaped_with: "\\", keep_escape: true)) {:ok, "ab\\cd"} iex(4)> "abc" |> parse(until("d")) {:error, "expected something ended by "d" at 1:1 but got "abc""} iex(5)> "abc" |> parse(until("d"), eof: true) {:ok, "abc"}
10. 10. iex(1)> "ab" |> parse(sequence_of([lit("a"), lit("b")])) {:ok, ["a", "b"]} iex(2)> "ac" |> parse(sequence_of([lit("a"), lit("b")])) {:error, "expected "b" at 1:2 but got "c""} iex(3)> ab = sequence_of([lit("a"), lit("b")]) %Paco.Parser{...} iex(4)> "abc" |> parse(sequence_of([ab, lit("c")])) {:ok, [["a", "b"], "c"]} iex(5)> "xxx" |> parse(sequence_of([ab, lit("c")])) {:error, "expected "a" at 1:1 but got "x""} iex(6)> "axx" |> parse(sequence_of([ab, lit("c")])) {:error, "expected "b" at 1:2 but got "x""} iex(7)> "abx" |> parse(sequence_of([ab, lit("c")])) {:error, "expected "c" at 1:3 but got "x""}
11. 11. iex(1)> "a" |> parse(one_of([lit("a"), lit("b")])) {:ok, "a"} iex(2)> "b" |> parse(one_of([lit("a"), lit("b")])) {:ok, "b"} iex(4)> # farthest failure (higher rank) wins nil iex(3)> "ab" |> parse(one_of([lit("ac"), lit("bc")])) {:error, "expected "ac" at 1:1 but got "ab""} iex(6)> # failures with same rank are composed nil iex(5)> "ab" |> parse(one_of([lit("ac"), lit("ad")])) {:error, "expected one of ["ac", "ad"] at 1:1 but got "ab""}
12. 12. iex(1)> "aaa" |> parse(repeat(lit("a"))) {:ok, ["a", "a", "a"]} iex(2)> "aaa" |> parse(repeat(lit("a"), 2)) {:ok, ["a", "a"]} iex(4)> "aaa" |> parse(repeat(lit("a"), at_most: 2)) {:ok, ["a", "a"]} iex(3)> "aaa" |> parse(repeat(lit("a"), at_least: 4)) {:error, "expected "a" at 1:2 but got the end of input"} iex(6)> "abba" |> parse(repeat(one_of([lit("a"), lit("b")]))) {:ok, ["a", "b", "b", "a"]}
13. 13. defmodule Paco.ASCII do ! @upper ["A","B","C","D","E",...,"Z"] ! @classes [... {:upper, :upper?, @upper}, ...] for {class, is_class, elements} <- @classes do def unquote(class)(), do: unquote(elements) for element <- elements do def unquote(is_class)(<<unquote(element)>>), do: true end def unquote(is_class)(_), do: false end ! # def upper, do: @upper # def upper?("A"), do: true # def upper?("B"), do: true # ... # def upper?(_), do: false
14. 14. ws = while(ASCII.ws) ! hello = lit("Hello") separator = sequence_of([ws, lit(","), ws]) what = while(ASCII.letter, at_least: 1) terminator = sequence_of([ws, lit("!")]) ! greetings = sequence_of([hello, separator, what, terminator]) ! parse("Hello,World!", greetings) |> IO.inspect # {:ok, ["Hello", ["", ",", ""], "World", ["", "!"]]} ! parse("Hello, BEAM!", greetings) |> IO.inspect # {:ok, ["Hello", ["", ",", " "], "BEAM", ["", "!"]]}
15. 15. # Good, not great: skip everything that is not interesting ! ws = while(ASCII.ws) ! hello = lit("Hello") |> skip separator = sequence_of([ws, lit(","), ws]) |> skip what = while(ASCII.letter, at_least: 1) terminator = sequence_of([ws, lit("!")]) |> skip ! greetings = sequence_of([hello, separator, what, terminator]) ! parse("Hello,World!", greetings) |> IO.inspect # {:ok, ["World"]} ! parse("Hello, BEAM!", greetings) |> IO.inspect # {:ok, ["BEAM"]}
16. 16. # Not everyone is so loud, `!` should be optional ! ws = while(ASCII.ws) ! hello = lit("Hello") |> skip separator = sequence_of([ws, lit(","), ws]) |> skip what = while(ASCII.letter, at_least: 1) terminator = sequence_of([ws, lit("!")]) |> maybe ! greetings = sequence_of([hello, separator, what, terminator]) ! parse("Hello, BEAM!", greetings) |> IO.inspect # {:ok, ["BEAM"]} ! parse("Hello, BEAM", greetings) |> IO.inspect # {:ok, ["BEAM"]}
17. 17. # Let's get rid of non significant whitespaces with lex(s) ! # In module Paco.Parser... ! parser lex(s), as: lit(s) |> surrounded_by(maybe(whitespaces)) ! parser surrounded_by(p, left, right), as: sequence_of([skip(left), p, skip(right)])
18. 18. # Use lex Luke! ! ws = while(ASCII.ws) ! hello = lit("Hello") |> skip what = while(ASCII.letter, at_least: 1) separator = lex(",") |> skip terminator = lex("!") |> maybe ! greetings = sequence_of([hello, separator, what, terminator]) ! parse("Hello, BEAM!", greetings) |> IO.inspect # {:ok, ["BEAM"]} ! parse("Hello, BEAM", greetings) |> IO.inspect # {:ok, ["BEAM"]}
19. 19. # It's common to have something non significant # that follows or precedes something significant ! # In module Paco.Parser... ! parser followed_by(p, right), as: sequence_of([p, skip(right)]) ! parser preceded_by(p, left), as: sequence_of([skip(left), p])
20. 20. # An alternative and shorter version ! what = while(ASCII.letter, at_least: 1) ! greetings = what |> preceded_by(lit("Hello") |> followed_by(lex(","))) |> followed_by(maybe(lex("!"))) ! ! parse("Hello, BEAM!", greetings) |> IO.inspect # {:ok, ["BEAM"]} ! parse("Hello, BEAM", greetings) |> IO.inspect # {:ok, ["BEAM"]}
21. 21. # Parse a sequence of numbers separated by `+` or `-` ! number = while(ASCII.digit, at_least: 1) operator = one_of([lex("+"), lex("-")]) ! expression = number |> separated_by(operator) ! parse("1", expression) |> IO.inspect # {:ok, ["1"]} ! parse("1 + 2", expression) |> IO.inspect # {:ok, ["1", "2"]} ! parse("1 + 2 - 3", expression) |> IO.inspect # {:ok, ["1", "2", "3"]} ! # Small problem... to compute the value we need the operators!
22. 22. # Parse a sequence of numbers separated by `+` or `-` ! number = while(ASCII.digit, at_least: 1) operator = one_of([lex("+"), lex("-")]) ! expression = number |> separated_by(keep(operator)) ! parse("1", expression) |> IO.inspect # {:ok, ["1"]} ! parse("1 + 2", expression) |> IO.inspect # {:ok, ["1", "+", "2"]} ! parse("1 + 2 - 3", expression) |> IO.inspect # {:ok, ["1", "+", "2", "-", "3"]} ! # Ok, but we need numbers not strings
23. 23. # In module Paco.Parser... parser bind(p, f) do fn state, _ -> case p.parse.(state, p) do %Success{result: result} = success -> case f.(result, success) do %Failure{} = failure -> failure %Success{} = success -> success result -> %Success{success|result: result} end %Failure{} = failure -> failure end end end
24. 24. # Parse a sequence of numbers separated by `+` or `-` ! number = while(ASCII.digit, at_least: 1) |> bind(&String.to_integer/1) ! operator = one_of([lex("+"), lex("-")]) ! expression = number |> separated_by(keep(operator)) ! parse("1 + 2 - 3", expression) |> IO.inspect # {:ok, [1, "+", 2, "-", 3]} ! # Missing only the last step... compute the result :-)
25. 25. # Parse a sequence of numbers separated by `+` or `-` ! number = while(ASCII.digit, at_least: 1) |> bind(&String.to_integer/1) ! operator = one_of([lex("+"), lex("-")]) ! expression = number |> separated_by(keep(operator)) |> bind(&Paco.Transform.separated_by(&1, fn("+", n, m) -> n + m ("-", n, m) -> n - m end)) ! ! parse("1 + 2 - 3", expression) |> IO.inspect # {:ok, 0}
26. 26. # Parse a{n}b{n}c{n} where n ∈ ℕ ! # If you knew the `n` (ex. 3) it would be easy ! p = sequence_of([while("a", 3), while("b", 3), while("c", 3)]) ! parse("aaabbbccc", p) |> IO.inspect # {:ok, ["aaa", "bbb", "ccc"]} ! # We need to be able to peek ahead and then create a parser # with that knowledge
27. 27. # In module Paco.Parser... ! parser peek(box(p)) do fn %State{at: at, text: text} = state, _ -> case p.parse.(state, p) do %Success{result: result} -> %Success{from: at, to: at, at: at, tail: text, result: result} %Failure{} = failure -> failure end end end
28. 28. # In module Paco.Parser... ! parser then(p, f) when is_function(f), as: bind(p, f) |> bind(fn(p, _, s) -> p.parse.(s, p) end)
29. 29. # Parse a{n}b{n}c{n} where n ∈ ℕ ! p = peek(while("a")) |> then(fn(a) -> len = String.length(a) sequence_of([while("a", len), while("b", len), while("c", len)]) end) ! parse("aaabbbccc", p) |> IO.inspect # {:ok, ["aaa", "bbb", "ccc"]} ! parse("aaabbccc", p) |> IO.inspect # {:error, "expected exactly 3 characters in alphabet "b" at 1:4 but got "bbc""}
30. 30. # An `element` is a word beginning with one uppercase letter # followed by zero or more lowercase letters element = sequence_of([while(ASCII.upper, 1), while(ASCII.lower)]) ! # A `quantity` is a number greater than zero # If the quantity is omitted assume the value of 1 as default quantity = while(ASCII.digit, at_least: 1) |> bind(&String.to_integer/1) |> maybe(default: 1) ! # A `reference` is an element optionally followed by a quantity reference = sequence_of([element, quantity]) ! formula = repeat(reference, at_least: 1)
31. 31. parse("H2O", formula) |> IO.inspect # {:ok, [[["H", ""], 2], [["O", ""], 1]]} ! # That's right but the output format sucks! ! # What we really want is something like # {:ok, [%{element: "H", quantity: 2}, %{element: "O", quantity: 1}]} ! # Is that possible???
32. 32. defprotocol Paco.Parsable do @moduledoc """ A protocol that converts terms into Paco parsers """ @fallback_to_any true @doc """ Returns a parser that parses `t` and keeps the shape of `t` """ @spec to_parser(t) :: Paco.Parser.t def to_parser(t) end
33. 33. defimpl Paco.Parsable, for: BitString do import Paco.Parser def to_parser(s) when is_binary(s) do lit(s) end def to_parser(s) do raise Protocol.UndefinedError, protocol: @protocol, value: s end end iex(1)> "aaa" |> parse(lit("aaa")) {:ok, "aaa"} iex(2)> "aaa" |> parse("aaa") {:ok, "aaa"}
34. 34. defimpl Paco.Parsable, for: List do import Paco.Parser def to_parser(l) do sequence_of(l) end end iex(1)> "ab" |> parse(sequence_of([lit("a"), lit("b")])) {:ok, ["a", "b"]} iex(2)> "ab" |> parse(sequence_of(["a", "b"])) {:ok, ["a", "b"]} iex(3)> "ab" |> parse(["a", "b"]) {:ok, ["a", "b"]}
35. 35. defimpl Paco.Parsable, for: Tuple do import Paco.Parser def to_parser(tuple) do sequence_of(Tuple.to_list(tuple)) |> bind(&List.to_tuple/1) end end iex(1)> "ab" |> parse({"a", "b"}) {:ok, {"a", "b"}}
36. 36. defimpl Paco.Parsable, for: Map do import Paco.Parser def to_parser(map) do {keys, values} = {Map.keys(map), Map.values(map)} sequence_of(values) |> bind(&(Enum.zip(keys, &1) |> Enum.into(Map.new))) end end iex(1)> "ab" |> parse(%{first: "a", last: "b"}) {:ok, %{first: "a", last: "b"}}
37. 37. element = [while(ASCII.upper, 1), while(ASCII.lower)] ! quantity = while(ASCII.digit, at_least: 1) |> bind(&String.to_integer/1) |> maybe(default: 1) ! reference = %{element: element, quantity: quantity} ! formula = repeat(reference, at_least: 1) ! parse("H2O", formula) |> IO.inspect # {:ok, [%{element: ["H", ""], quantity: 2}, # %{element: ["O", ""], quantity: 1}]} ! # Almost...
38. 38. # parser join(p, joiner \\ ""), # as: bind(p, &Enum.join(&1, joiner)) ! element = [while(ASCII.upper, 1), while(ASCII.lower)] |> join ! quantity = while(ASCII.digit, at_least: 1) |> bind(&String.to_integer/1) |> maybe(default: 1) ! reference = %{element: element, quantity: quantity} ! formula = repeat(reference, at_least: 1) ! parse("H2O", formula) |> IO.inspect # {:ok, [%{element: "H", quantity: 2}, # %{element: "O", quantity: 1}]} ! # Yahoooo!!!
39. 39. element = [while(ASCII.upper, 1), while(ASCII.lower)] |> join ! # But a `quantity` is a number greater than zero! quantity = while(ASCII.digit, at_least: 1) |> bind(&String.to_integer/1) |> maybe(default: 1) ! reference = %{element: element, quantity: quantity} ! formula = repeat(reference, at_least: 1) ! parse("Na0", formula) |> IO.inspect # {:ok, [%{element: "Na", quantity: 0}]} ! # Ouch...
40. 40. # ... # A `quantity` is a number greater than zero quantity = while(ASCII.digit, at_least: 1) |> bind(&String.to_integer/1) |> maybe(default: 1) |> only_if(&(&1 > 0)) ! # ... ! parse("Na0", formula) |> IO.inspect # {:error, "0 is not acceptable at 1:3"}
41. 41. # ... # A `quantity` is a number greater than zero ! error_message = "quantity must be greater than 0 %AT%" ! greater_than_zero = &{&1 > 0, error_message} ! quantity = while(ASCII.digit, at_least: 1) |> bind(&String.to_integer/1) |> maybe(default: 1) |> only_if(greater_than_zero) ! # ... ! parse("Na0", formula) |> IO.inspect # {:error, "quantity must be greater than 0 at 1:3"}
42. 42. # Parse something like `(1, (2, 3))` ! number = while(ASCII.digit, at_least: 1) |> bind(&String.to_integer/1) ! # We need to name something that is not yet defined, # actually we need to name something in its definition ! list = one_of([number, ???]) |> separated_by(ASCII.comma) |> surrounded_by(ASCII.round_brackets)
43. 43. # In module Paco.Parser... ! parser recursive(f) do fn state, this -> box(f.(this)).parse.(state, this) end end
44. 44. # Parse something like `(1, (2, 3))` ! number = while(ASCII.digit, at_least: 1) |> bind(&String.to_integer/1) ! list = recursive(fn(list) -> one_of([number, list]) |> separated_by(ASCII.comma) |> surrounded_by(ASCII.round_brackets) end) ! parse("(1, 2)", list) |> IO.inspect # {:ok, [1, 2]} ! parse("(1, (2, 3))", list) |> IO.inspect # {:ok, [1, [2, 3]]}
45. 45. defmodule ListOfLists do use Paco alias Paco.ASCII ! parser number do while(ASCII.digit, at_least: 1) |> bind(&String.to_integer/1) end ! parser list do one_of([number, list]) |> separated_by(ASCII.comma) |> surrounded_by(ASCII.round_brackets) end end ! Paco.parse("1", ListOfLists.number) |> IO.inspect # {:ok, 1}
46. 46. # In module Paco... ! defmacro __using__(_) do quote do import Paco.Macro.ParserModuleDefinition import Paco.Parser ! Module.register_attribute(__MODULE__, :paco_parsers, accumulate: true) ! @before_compile Paco end end
47. 47. # In module Paco... ! defmacro __before_compile__(env) do root_parser = pick_root_parser_between( Module.get_attribute(env.module, :paco_parsers) |> Enum.reverse ) ! quote do def parse(s, opts \\ []) do Paco.parse(s, apply(__MODULE__, unquote(root_parser), []), opts) end ! def parse!(s, opts \\ []) do Paco.parse!(s, apply(__MODULE__, unquote(root_parser), []), opts) end end end
48. 48. # Everything we saw until now works with streams of text! ! ["a", "b", "", "ab", "", "a", "", "", "b", "", ""] |> Paco.Stream.parse(lit("ab")) |> Enum.to_list |> IO.inspect # ["ab", "ab", "ab"] ! [~s|{"foo|, ~s|": "bar"|, ~s|}[1, 2|, ~s|, 3]|] |> Paco.Parser.JSON.stream |> Enum.to_list |> IO.inspect # [%{"foo" => "bar"}, [1, 2, 3]]
49. 49. parser lit(s) do fn %State{at: from, text: text, stream: stream} = state, this -> case Paco.String.consume(text, s, from) do {tail, _, to, at} -> %Success{from: from, to: to, at: at, tail: tail, result: s} ! {:not_enough, _, _, _, _} when is_pid(stream) -> wait_for_more_and_continue(state, this)  {_, _, _, _, {n, _, _}} -> %Failure{at: from, tail: text, expected: s, rank: n+1} end end end !
50. 50. defp wait_for_more_and_continue(state, this) do %State{text: text, stream: stream} = state send(stream, {self, :more}) receive do {:load, more_text} -> this.parse.(%State{state|text: text <> more_text}, this) :halted -> # The stream is over, switching to a non stream mode # is the same as to tell the parser to behave knowing # that more input will never come this.parse.(%State{state|stream: nil}, this) end end
51. 51. defmodule Paco.Parser.JSON do alias Paco.ASCII use Paco ! root parser all, do: one_of([object, array]) ! parser object do pair(string, value, separated_by: ASCII.colon) |> separated_by(ASCII.comma) |> surrounded_by(ASCII.curly_brackets) |> bind(&to_map/1) end ! parser array do value |> separated_by(ASCII.comma) |> surrounded_by(ASCII.square_brackets) end # ...
52. 52. defmodule Paco.Parser.JSON do # ... ! parser value do one_of([ string, number, object, array, literal_true, literal_false, literal_null]) end ! parser string do between(ASCII.double_quotes, escaped_with: "\\", strip: false) |> bind(&replace_escapes_in_string/1) end # ...
53. 53. defmodule Paco.Parser.JSON do # ... ! parser literal_true, do: lit("true") |> replace_with(true) parser literal_false, do: lit("false") |> replace_with(false) parser literal_null, do: lit("null") |> replace_with(nil) ! # ... end
54. 54. Settings: duration: 1.0 s ! ## Paco.Benchmark.JSON [00:21:14] 1/4: poison small [00:21:16] 2/4: poison medium [00:21:18] 3/4: paco small [00:21:21] 4/4: paco medium ! Finished in 8.78 seconds ! ## Paco.Benchmark.JSON poison small 100000 14.72 µs/op poison medium 10000 144.58 µs/op paco small 5000 493.32 µs/op paco medium 500 4152.14 µs/op
55. 55. ✓ ✓ ☕ ☕ 👎 \$