Successfully reported this slideshow.
We use your LinkedIn profile and activity data to personalize ads and to show you more relevant ads. You can change your ad preferences anytime.

Parse Everything With Elixir

2,946 views

Published on

Paco: Parser combinator library for Elixir. The presentation contains many examples of increasing complexity.

Published in: Technology
  • Be the first to comment

Parse Everything With Elixir

  1. 1. elixir
  2. 2. ! " #
  3. 3.
  4. 4. Interactive Elixir (1.1.0) iex(1)> import Paco nil iex(2)> import Paco.Parser nil iex(3)> parse("a", lit("a")) {:ok, "a"} iex(4)> parse("a", lit("a"), format: :raw) %Paco.Success{ from: {0, 1, 1}, to: {0, 1, 1}, at: {1, 1, 2}, result: "a", tail: "", ...}
  5. 5. iex(5)> parse("aaa", lit("aaa")) {:ok, "aaa"} iex(6)> "aaa" |> parse(lit("aaa")) {:ok, "aaa"} iex(7)> "aaa" |> parse(lit("a")) {:ok, "a"} iex(8)> "aaa" |> parse(lit("a"), format: :raw) %Paco.Success{ from: {0, 1, 1}, to: {0, 1, 1}, at: {1, 1, 2}, result: "a", tail: "aa", ...}
  6. 6. iex(9)> "b" |> parse(lit("a")) {:error, "expected "a" at 1:1 but got "b""} iex(10)> "b" |> parse(lit("a"), format: :raw) %Paco.Failure{ at: {0, 1, 1}, expected: "a", tail: "b", ...}
  7. 7. iex(1)> "aaa" |> parse(any) {:ok, "a"} iex(2)> "aaa" |> parse(any(1)) {:ok, "a"} iex(3)> "aaa" |> parse(any(2)) {:ok, "aa"} iex(4)> "a" |> parse(any(2)) {:error, "expected exactly 2 characters at 1:1 but got "a""} iex(5)> "aaa" |> parse(any(at_least: 2)) {:ok, "aaa"} iex(6)> "aaa" |> parse(any(at_most: 2)) {:ok, "aa"}
  8. 8. iex(1)> "bbabcd" |> parse(while("abc")) {:ok, "bbabc"} iex(2)> "xxx" |> parse(while("abc")) {:ok, ""} iex(3)> "xxx" |> parse(while("abc", at_least: 2)) {:error, "expected at least 2 characters in alphabet "abc" at 1:1 but got "xx""} iex(4)> import Paco.ASCII, only: [lowercase?: 1] iex(5)> "abCD" |> parse(while(&lowercase?/1)) {:ok, "ab"} iex(6)> "abCD" |> parse(while(&lowercase?/1, at_least: 3)) {:error, "expected at least 3 lowercase characters at 1:1 but got "abC""}
  9. 9. iex(1)> "abc" |> parse(until("c")) {:ok, "ab"} iex(2)> "ab\\cdc" |> parse(until("c", escaped_with: "\\")) {:ok, "abcd"} iex(3)> "ab\\cdc" |> parse(until("c", escaped_with: "\\", keep_escape: true)) {:ok, "ab\\cd"} iex(4)> "abc" |> parse(until("d")) {:error, "expected something ended by "d" at 1:1 but got "abc""} iex(5)> "abc" |> parse(until("d"), eof: true) {:ok, "abc"}
  10. 10. iex(1)> "ab" |> parse(sequence_of([lit("a"), lit("b")])) {:ok, ["a", "b"]} iex(2)> "ac" |> parse(sequence_of([lit("a"), lit("b")])) {:error, "expected "b" at 1:2 but got "c""} iex(3)> ab = sequence_of([lit("a"), lit("b")]) %Paco.Parser{...} iex(4)> "abc" |> parse(sequence_of([ab, lit("c")])) {:ok, [["a", "b"], "c"]} iex(5)> "xxx" |> parse(sequence_of([ab, lit("c")])) {:error, "expected "a" at 1:1 but got "x""} iex(6)> "axx" |> parse(sequence_of([ab, lit("c")])) {:error, "expected "b" at 1:2 but got "x""} iex(7)> "abx" |> parse(sequence_of([ab, lit("c")])) {:error, "expected "c" at 1:3 but got "x""}
  11. 11. iex(1)> "a" |> parse(one_of([lit("a"), lit("b")])) {:ok, "a"} iex(2)> "b" |> parse(one_of([lit("a"), lit("b")])) {:ok, "b"} iex(4)> # farthest failure (higher rank) wins nil iex(3)> "ab" |> parse(one_of([lit("ac"), lit("bc")])) {:error, "expected "ac" at 1:1 but got "ab""} iex(6)> # failures with same rank are composed nil iex(5)> "ab" |> parse(one_of([lit("ac"), lit("ad")])) {:error, "expected one of ["ac", "ad"] at 1:1 but got "ab""}
  12. 12. iex(1)> "aaa" |> parse(repeat(lit("a"))) {:ok, ["a", "a", "a"]} iex(2)> "aaa" |> parse(repeat(lit("a"), 2)) {:ok, ["a", "a"]} iex(4)> "aaa" |> parse(repeat(lit("a"), at_most: 2)) {:ok, ["a", "a"]} iex(3)> "aaa" |> parse(repeat(lit("a"), at_least: 4)) {:error, "expected "a" at 1:2 but got the end of input"} iex(6)> "abba" |> parse(repeat(one_of([lit("a"), lit("b")]))) {:ok, ["a", "b", "b", "a"]}
  13. 13. defmodule Paco.ASCII do ! @upper ["A","B","C","D","E",...,"Z"] ! @classes [... {:upper, :upper?, @upper}, ...] for {class, is_class, elements} <- @classes do def unquote(class)(), do: unquote(elements) for element <- elements do def unquote(is_class)(<<unquote(element)>>), do: true end def unquote(is_class)(_), do: false end ! # def upper, do: @upper # def upper?("A"), do: true # def upper?("B"), do: true # ... # def upper?(_), do: false
  14. 14. ws = while(ASCII.ws) ! hello = lit("Hello") separator = sequence_of([ws, lit(","), ws]) what = while(ASCII.letter, at_least: 1) terminator = sequence_of([ws, lit("!")]) ! greetings = sequence_of([hello, separator, what, terminator]) ! parse("Hello,World!", greetings) |> IO.inspect # {:ok, ["Hello", ["", ",", ""], "World", ["", "!"]]} ! parse("Hello, BEAM!", greetings) |> IO.inspect # {:ok, ["Hello", ["", ",", " "], "BEAM", ["", "!"]]}
  15. 15. # Good, not great: skip everything that is not interesting ! ws = while(ASCII.ws) ! hello = lit("Hello") |> skip separator = sequence_of([ws, lit(","), ws]) |> skip what = while(ASCII.letter, at_least: 1) terminator = sequence_of([ws, lit("!")]) |> skip ! greetings = sequence_of([hello, separator, what, terminator]) ! parse("Hello,World!", greetings) |> IO.inspect # {:ok, ["World"]} ! parse("Hello, BEAM!", greetings) |> IO.inspect # {:ok, ["BEAM"]}
  16. 16. # Not everyone is so loud, `!` should be optional ! ws = while(ASCII.ws) ! hello = lit("Hello") |> skip separator = sequence_of([ws, lit(","), ws]) |> skip what = while(ASCII.letter, at_least: 1) terminator = sequence_of([ws, lit("!")]) |> maybe ! greetings = sequence_of([hello, separator, what, terminator]) ! parse("Hello, BEAM!", greetings) |> IO.inspect # {:ok, ["BEAM"]} ! parse("Hello, BEAM", greetings) |> IO.inspect # {:ok, ["BEAM"]}
  17. 17. # Let's get rid of non significant whitespaces with lex(s) ! # In module Paco.Parser... ! parser lex(s), as: lit(s) |> surrounded_by(maybe(whitespaces)) ! parser surrounded_by(p, left, right), as: sequence_of([skip(left), p, skip(right)])
  18. 18. # Use lex Luke! ! ws = while(ASCII.ws) ! hello = lit("Hello") |> skip what = while(ASCII.letter, at_least: 1) separator = lex(",") |> skip terminator = lex("!") |> maybe ! greetings = sequence_of([hello, separator, what, terminator]) ! parse("Hello, BEAM!", greetings) |> IO.inspect # {:ok, ["BEAM"]} ! parse("Hello, BEAM", greetings) |> IO.inspect # {:ok, ["BEAM"]}
  19. 19. # It's common to have something non significant # that follows or precedes something significant ! # In module Paco.Parser... ! parser followed_by(p, right), as: sequence_of([p, skip(right)]) ! parser preceded_by(p, left), as: sequence_of([skip(left), p])
  20. 20. # An alternative and shorter version ! what = while(ASCII.letter, at_least: 1) ! greetings = what |> preceded_by(lit("Hello") |> followed_by(lex(","))) |> followed_by(maybe(lex("!"))) ! ! parse("Hello, BEAM!", greetings) |> IO.inspect # {:ok, ["BEAM"]} ! parse("Hello, BEAM", greetings) |> IO.inspect # {:ok, ["BEAM"]}
  21. 21. # Parse a sequence of numbers separated by `+` or `-` ! number = while(ASCII.digit, at_least: 1) operator = one_of([lex("+"), lex("-")]) ! expression = number |> separated_by(operator) ! parse("1", expression) |> IO.inspect # {:ok, ["1"]} ! parse("1 + 2", expression) |> IO.inspect # {:ok, ["1", "2"]} ! parse("1 + 2 - 3", expression) |> IO.inspect # {:ok, ["1", "2", "3"]} ! # Small problem... to compute the value we need the operators!
  22. 22. # Parse a sequence of numbers separated by `+` or `-` ! number = while(ASCII.digit, at_least: 1) operator = one_of([lex("+"), lex("-")]) ! expression = number |> separated_by(keep(operator)) ! parse("1", expression) |> IO.inspect # {:ok, ["1"]} ! parse("1 + 2", expression) |> IO.inspect # {:ok, ["1", "+", "2"]} ! parse("1 + 2 - 3", expression) |> IO.inspect # {:ok, ["1", "+", "2", "-", "3"]} ! # Ok, but we need numbers not strings
  23. 23. # In module Paco.Parser... parser bind(p, f) do fn state, _ -> case p.parse.(state, p) do %Success{result: result} = success -> case f.(result, success) do %Failure{} = failure -> failure %Success{} = success -> success result -> %Success{success|result: result} end %Failure{} = failure -> failure end end end
  24. 24. # Parse a sequence of numbers separated by `+` or `-` ! number = while(ASCII.digit, at_least: 1) |> bind(&String.to_integer/1) ! operator = one_of([lex("+"), lex("-")]) ! expression = number |> separated_by(keep(operator)) ! parse("1 + 2 - 3", expression) |> IO.inspect # {:ok, [1, "+", 2, "-", 3]} ! # Missing only the last step... compute the result :-)
  25. 25. # Parse a sequence of numbers separated by `+` or `-` ! number = while(ASCII.digit, at_least: 1) |> bind(&String.to_integer/1) ! operator = one_of([lex("+"), lex("-")]) ! expression = number |> separated_by(keep(operator)) |> bind(&Paco.Transform.separated_by(&1, fn("+", n, m) -> n + m ("-", n, m) -> n - m end)) ! ! parse("1 + 2 - 3", expression) |> IO.inspect # {:ok, 0}
  26. 26. # Parse a{n}b{n}c{n} where n ∈ ℕ ! # If you knew the `n` (ex. 3) it would be easy ! p = sequence_of([while("a", 3), while("b", 3), while("c", 3)]) ! parse("aaabbbccc", p) |> IO.inspect # {:ok, ["aaa", "bbb", "ccc"]} ! # We need to be able to peek ahead and then create a parser # with that knowledge
  27. 27. # In module Paco.Parser... ! parser peek(box(p)) do fn %State{at: at, text: text} = state, _ -> case p.parse.(state, p) do %Success{result: result} -> %Success{from: at, to: at, at: at, tail: text, result: result} %Failure{} = failure -> failure end end end
  28. 28. # In module Paco.Parser... ! parser then(p, f) when is_function(f), as: bind(p, f) |> bind(fn(p, _, s) -> p.parse.(s, p) end)
  29. 29. # Parse a{n}b{n}c{n} where n ∈ ℕ ! p = peek(while("a")) |> then(fn(a) -> len = String.length(a) sequence_of([while("a", len), while("b", len), while("c", len)]) end) ! parse("aaabbbccc", p) |> IO.inspect # {:ok, ["aaa", "bbb", "ccc"]} ! parse("aaabbccc", p) |> IO.inspect # {:error, "expected exactly 3 characters in alphabet "b" at 1:4 but got "bbc""}
  30. 30. # An `element` is a word beginning with one uppercase letter # followed by zero or more lowercase letters element = sequence_of([while(ASCII.upper, 1), while(ASCII.lower)]) ! # A `quantity` is a number greater than zero # If the quantity is omitted assume the value of 1 as default quantity = while(ASCII.digit, at_least: 1) |> bind(&String.to_integer/1) |> maybe(default: 1) ! # A `reference` is an element optionally followed by a quantity reference = sequence_of([element, quantity]) ! formula = repeat(reference, at_least: 1)
  31. 31. parse("H2O", formula) |> IO.inspect # {:ok, [[["H", ""], 2], [["O", ""], 1]]} ! # That's right but the output format sucks! ! # What we really want is something like # {:ok, [%{element: "H", quantity: 2}, %{element: "O", quantity: 1}]} ! # Is that possible???
  32. 32. defprotocol Paco.Parsable do @moduledoc """ A protocol that converts terms into Paco parsers """ @fallback_to_any true @doc """ Returns a parser that parses `t` and keeps the shape of `t` """ @spec to_parser(t) :: Paco.Parser.t def to_parser(t) end
  33. 33. defimpl Paco.Parsable, for: BitString do import Paco.Parser def to_parser(s) when is_binary(s) do lit(s) end def to_parser(s) do raise Protocol.UndefinedError, protocol: @protocol, value: s end end iex(1)> "aaa" |> parse(lit("aaa")) {:ok, "aaa"} iex(2)> "aaa" |> parse("aaa") {:ok, "aaa"}
  34. 34. defimpl Paco.Parsable, for: List do import Paco.Parser def to_parser(l) do sequence_of(l) end end iex(1)> "ab" |> parse(sequence_of([lit("a"), lit("b")])) {:ok, ["a", "b"]} iex(2)> "ab" |> parse(sequence_of(["a", "b"])) {:ok, ["a", "b"]} iex(3)> "ab" |> parse(["a", "b"]) {:ok, ["a", "b"]}
  35. 35. defimpl Paco.Parsable, for: Tuple do import Paco.Parser def to_parser(tuple) do sequence_of(Tuple.to_list(tuple)) |> bind(&List.to_tuple/1) end end iex(1)> "ab" |> parse({"a", "b"}) {:ok, {"a", "b"}}
  36. 36. defimpl Paco.Parsable, for: Map do import Paco.Parser def to_parser(map) do {keys, values} = {Map.keys(map), Map.values(map)} sequence_of(values) |> bind(&(Enum.zip(keys, &1) |> Enum.into(Map.new))) end end iex(1)> "ab" |> parse(%{first: "a", last: "b"}) {:ok, %{first: "a", last: "b"}}
  37. 37. element = [while(ASCII.upper, 1), while(ASCII.lower)] ! quantity = while(ASCII.digit, at_least: 1) |> bind(&String.to_integer/1) |> maybe(default: 1) ! reference = %{element: element, quantity: quantity} ! formula = repeat(reference, at_least: 1) ! parse("H2O", formula) |> IO.inspect # {:ok, [%{element: ["H", ""], quantity: 2}, # %{element: ["O", ""], quantity: 1}]} ! # Almost...
  38. 38. # parser join(p, joiner \\ ""), # as: bind(p, &Enum.join(&1, joiner)) ! element = [while(ASCII.upper, 1), while(ASCII.lower)] |> join ! quantity = while(ASCII.digit, at_least: 1) |> bind(&String.to_integer/1) |> maybe(default: 1) ! reference = %{element: element, quantity: quantity} ! formula = repeat(reference, at_least: 1) ! parse("H2O", formula) |> IO.inspect # {:ok, [%{element: "H", quantity: 2}, # %{element: "O", quantity: 1}]} ! # Yahoooo!!!
  39. 39. element = [while(ASCII.upper, 1), while(ASCII.lower)] |> join ! # But a `quantity` is a number greater than zero! quantity = while(ASCII.digit, at_least: 1) |> bind(&String.to_integer/1) |> maybe(default: 1) ! reference = %{element: element, quantity: quantity} ! formula = repeat(reference, at_least: 1) ! parse("Na0", formula) |> IO.inspect # {:ok, [%{element: "Na", quantity: 0}]} ! # Ouch...
  40. 40. # ... # A `quantity` is a number greater than zero quantity = while(ASCII.digit, at_least: 1) |> bind(&String.to_integer/1) |> maybe(default: 1) |> only_if(&(&1 > 0)) ! # ... ! parse("Na0", formula) |> IO.inspect # {:error, "0 is not acceptable at 1:3"}
  41. 41. # ... # A `quantity` is a number greater than zero ! error_message = "quantity must be greater than 0 %AT%" ! greater_than_zero = &{&1 > 0, error_message} ! quantity = while(ASCII.digit, at_least: 1) |> bind(&String.to_integer/1) |> maybe(default: 1) |> only_if(greater_than_zero) ! # ... ! parse("Na0", formula) |> IO.inspect # {:error, "quantity must be greater than 0 at 1:3"}
  42. 42. # Parse something like `(1, (2, 3))` ! number = while(ASCII.digit, at_least: 1) |> bind(&String.to_integer/1) ! # We need to name something that is not yet defined, # actually we need to name something in its definition ! list = one_of([number, ???]) |> separated_by(ASCII.comma) |> surrounded_by(ASCII.round_brackets)
  43. 43. # In module Paco.Parser... ! parser recursive(f) do fn state, this -> box(f.(this)).parse.(state, this) end end
  44. 44. # Parse something like `(1, (2, 3))` ! number = while(ASCII.digit, at_least: 1) |> bind(&String.to_integer/1) ! list = recursive(fn(list) -> one_of([number, list]) |> separated_by(ASCII.comma) |> surrounded_by(ASCII.round_brackets) end) ! parse("(1, 2)", list) |> IO.inspect # {:ok, [1, 2]} ! parse("(1, (2, 3))", list) |> IO.inspect # {:ok, [1, [2, 3]]}
  45. 45. defmodule ListOfLists do use Paco alias Paco.ASCII ! parser number do while(ASCII.digit, at_least: 1) |> bind(&String.to_integer/1) end ! parser list do one_of([number, list]) |> separated_by(ASCII.comma) |> surrounded_by(ASCII.round_brackets) end end ! Paco.parse("1", ListOfLists.number) |> IO.inspect # {:ok, 1}
  46. 46. # In module Paco... ! defmacro __using__(_) do quote do import Paco.Macro.ParserModuleDefinition import Paco.Parser ! Module.register_attribute(__MODULE__, :paco_parsers, accumulate: true) ! @before_compile Paco end end
  47. 47. # In module Paco... ! defmacro __before_compile__(env) do root_parser = pick_root_parser_between( Module.get_attribute(env.module, :paco_parsers) |> Enum.reverse ) ! quote do def parse(s, opts \\ []) do Paco.parse(s, apply(__MODULE__, unquote(root_parser), []), opts) end ! def parse!(s, opts \\ []) do Paco.parse!(s, apply(__MODULE__, unquote(root_parser), []), opts) end end end
  48. 48. # Everything we saw until now works with streams of text! ! ["a", "b", "", "ab", "", "a", "", "", "b", "", ""] |> Paco.Stream.parse(lit("ab")) |> Enum.to_list |> IO.inspect # ["ab", "ab", "ab"] ! [~s|{"foo|, ~s|": "bar"|, ~s|}[1, 2|, ~s|, 3]|] |> Paco.Parser.JSON.stream |> Enum.to_list |> IO.inspect # [%{"foo" => "bar"}, [1, 2, 3]]
  49. 49. parser lit(s) do fn %State{at: from, text: text, stream: stream} = state, this -> case Paco.String.consume(text, s, from) do {tail, _, to, at} -> %Success{from: from, to: to, at: at, tail: tail, result: s} ! {:not_enough, _, _, _, _} when is_pid(stream) -> wait_for_more_and_continue(state, this)
 {_, _, _, _, {n, _, _}} -> %Failure{at: from, tail: text, expected: s, rank: n+1} end end end !
  50. 50. defp wait_for_more_and_continue(state, this) do %State{text: text, stream: stream} = state send(stream, {self, :more}) receive do {:load, more_text} -> this.parse.(%State{state|text: text <> more_text}, this) :halted -> # The stream is over, switching to a non stream mode # is the same as to tell the parser to behave knowing # that more input will never come this.parse.(%State{state|stream: nil}, this) end end
  51. 51. defmodule Paco.Parser.JSON do alias Paco.ASCII use Paco ! root parser all, do: one_of([object, array]) ! parser object do pair(string, value, separated_by: ASCII.colon) |> separated_by(ASCII.comma) |> surrounded_by(ASCII.curly_brackets) |> bind(&to_map/1) end ! parser array do value |> separated_by(ASCII.comma) |> surrounded_by(ASCII.square_brackets) end # ...
  52. 52. defmodule Paco.Parser.JSON do # ... ! parser value do one_of([ string, number, object, array, literal_true, literal_false, literal_null]) end ! parser string do between(ASCII.double_quotes, escaped_with: "\\", strip: false) |> bind(&replace_escapes_in_string/1) end # ...
  53. 53. defmodule Paco.Parser.JSON do # ... ! parser literal_true, do: lit("true") |> replace_with(true) parser literal_false, do: lit("false") |> replace_with(false) parser literal_null, do: lit("null") |> replace_with(nil) ! # ... end
  54. 54. Settings: duration: 1.0 s ! ## Paco.Benchmark.JSON [00:21:14] 1/4: poison small [00:21:16] 2/4: poison medium [00:21:18] 3/4: paco small [00:21:21] 4/4: paco medium ! Finished in 8.78 seconds ! ## Paco.Benchmark.JSON poison small 100000 14.72 µs/op poison medium 10000 144.58 µs/op paco small 5000 493.32 µs/op paco medium 500 4152.14 µs/op
  55. 55. ✓ ✓ ☕ ☕ 👎 $

×