SlideShare a Scribd company logo
CL-NLP –
A Natural Language Processing
Library for Common Lisp
Vsevolod Dyomkin
Topics
* Motivation
* Library high-level design
* Overview of the functions
* Some technical details
* Next steps
Motivation
NLP & Lisp are a match made
in heaven
Previous Work
* CL-Langutils
* Lexiparse, Sparser, CL-EARLEY-PARSER,
Basic-English-Grammar
* Wordnet interfaces
* CMU AI Repository
* Natural Language Understanding
* Natural Language Processing in
Lisp
Essential Scope
* Corpora interfaces
* Language modeling
* Measures
* Tokenization
* Tagging
* Parsing
* Wordnet interface
* Construct a pipeline
TDD, BDD, CDD
An NLP Pipeline
“This is a test.” ->
                TOP
                 |
                 S
      /~~~~~~~~~~|~~~~~~~~~~\
     NP          VP          .
     |        /~~~~~~\       |
     DT     VBZ      NP      .
     |       |     /~~~~\
    This     is   DT    NN
                  |     |
                  a    test
An NLP Pipeline
(print-parse-trees "This is a test.")
(defun print-parse-trees (text)
  "Split TEXT into sentences and pretty-print a parse tree for each of them.
Every sentence is tagged with *HMM* and the tagged result is parsed
with *PCFG*."
  (dolist (sentence (tokenize <sentence-tokenizer> text))
    ;; Tag the current SENTENCE rather than the whole TEXT: otherwise the
    ;; loop variable is unused and the same full-text parse is printed once
    ;; per sentence.
    (pprint-tree (parse *pcfg* (tag *hmm* sentence)))))
;; Tagger model passed to TAG in PRINT-PARSE-TREES; obtained by calling
;; TRAIN with the 'HMM-TAGGER designator and the "treebank_dir/" path.
(defparameter *hmm*
(train 'hmm-tagger "treebank_dir/"))
;; Grammar model passed to PARSE in PRINT-PARSE-TREES; obtained by calling
;; TRAIN with the 'PCFG designator and the "treebank_dir/" path.
(defparameter *pcfg*
(train 'pcfg "treebank_dir/"))
Some Design Choices
* A pseudo-hierarchical
package design
High-level Design
* cl-nlp & cl-nlp-contrib
& more
* base & functional packages
* nlp-user
Modules
* nlp.core, nlp.corpora,
nlp.util, nlp.test-util
* nlp.syntax, nlp.generation,
nlp.phonetics
* nlp.contrib.wordnet,
nlp.contrib.ms-ngrams
Some Design Choices
* Use the whole language
- defclass & defstruct
- hash-tables & alists
- do & loop
Some Design Choices
* Grow the language
- rutils
+ reader macros
+ shorthands
+ dotable, dolines
- special-purpose utils
General-Purpose Utils
;; Encode a response object to STREAM.  The "match" entry is a hand-built
;; JSON object string with one "key":[a,b] pair per entry of the table
;; returned as the second value of FN, or NIL when there is no such table.
;; NOTE(review): #{ ... } is presumably the rutils literal hash-table
;; syntax and #`( ... %) its one-argument lambda shorthand -- confirm.
(json:encode-json #{
                  "success" t
                  "sentence" text
                  "match"
                  (when-it (2nd (funcall fn (get-model text)))
                    (strcat
                     "{"
                     (strjoin ","
                              ;; The inner double quotes have to be escaped;
                              ;; unescaped, the format string would end right
                              ;; after its opening quote.
                              (mapcar #`(fmt "\"~A\":[~A,~A]"
                                             (car %) (cadr %)
                                             (cddr %))
                                      (ht->alist it)))
                     "}"))
                  } stream)
A Special-Purpose Util
;; Defines the function WORD-TOKENIZER and the symbol macro <WORD-TOKENIZER>
;; (see DEFINE-LAZY-SINGLETON): the tokenizer object is created by the MAKE
;; form on first use and the cached object is returned afterwards.
(define-lazy-singleton word-tokenizer
(make 'postprocessing-regex-word-tokenizer)
"Default word tokenizer.")
(defun tokenize-ngram (ngrams str)
  "Return STR split into a list of words when the order of NGRAMS is
greater than 1; otherwise return STR unchanged."
  (let ((order (ngrams-order ngrams)))
    (cond ((> order 1)
           ;; Higher-order n-grams are looked up by word sequence.
           (tokenize <word-tokenizer> str))
          (t str))))
A Special-Purpose Util
(defmacro define-lazy-singleton (name init &optional docstring)
  "Define a function NAME that returns a lazily created, cached object,
and a symbol macro <NAME> that expands into a call to that function.
INIT is evaluated on the first call, and again on any later call for
which the cached value is still NIL.  DOCSTRING, when given, becomes the
documentation of the function NAME."
  (with-gensyms (singleton)
    ;; The symbol-macro name is computed once, at macroexpansion time.
    (let ((shorthand (mksym name :format "<~A>")))
      `(let (,singleton)
         ;; The function closes over the private cache variable.
         (defun ,name ()
           ,docstring
           (or ,singleton
               (setf ,singleton ,init)))
         (define-symbol-macro ,shorthand (,name))))))
Some Design Choices
* Use CLOS as a foundation
Basic Cell
(defclass regex-word-tokenizer (tokenizer)
  ((regex :accessor tokenizer-regex
          :initarg :regex
          :initform
          ;; Matches either a run of word characters or one punctuation
          ;; character from the bracketed set.  The literal was reconstructed
          ;; from a copy that had lost its backslashes: inside a Lisp string
          ;; the regex escape \w is written \\w and a double quote is \".
          (re:create-scanner
           "\\w+|[!\"#$%&'*+,./:;<=>?@^`~…\\(\\)⟨⟩{}\\[\\|\\]‒–—―«»“”‘’¶-]")
          :documentation
          "A simpler variant would be [^\\s]+ —
it doesn't split punctuation, yet
sometimes it's desirable."))
  (:documentation
   "Regex-based word tokenizer."))
Basic Cell
(defmethod tokenize ((tokenizer regex-word-tokenizer) string)
  "Split STRING into words using the regex of TOKENIZER.
Returns two values: the list of matched substrings and the list of
their (BEG . END) spans, in match order."
  ;; RE:ALL-MATCHES is consumed as a flat list BEG1 END1 BEG2 END2 ...,
  ;; hence the destructuring of two elements and the step by CDDR.
  (loop :for (beg end)
          :on (re:all-matches (tokenizer-regex tokenizer) string)
        :by #'cddr
        :collect (sub string beg end) :into words
        :collect (cons beg end) :into spans
        ;; The final closing paren of the DEFMETHOD was missing.
        :finally (return (values words spans))))
Another Example
(defgeneric parse (model sentence)
  (:documentation
   "Parse SENTENCE with MODEL.")
  ;; When SENTENCE is a raw string, tokenize it with the default word
  ;; tokenizer and continue with the resulting tokens.  The parameter is
  ;; named SENTENCE; STRING is only its specializer, so referring to a
  ;; variable STRING here would be an unbound-variable error.
  ;; NOTE(review): the CLHS requires that arguments given to
  ;; CALL-NEXT-METHOD select the same applicable methods as the original
  ;; ones -- confirm that switching from a string to a token list is
  ;; accepted by the target implementations.
  (:method :around (model (sentence string))
    (call-next-method
     model (tokenize <word-tokenizer> sentence))))
(defgeneric parse-n (model sentence n)
  (:documentation
   "Return N best parse trees of the SENTENCE
with MODEL.")
  ;; When SENTENCE is a raw string, tokenize it with the default word
  ;; tokenizer and continue with the resulting tokens and the same N.
  ;; The parameter is named SENTENCE; STRING is only its specializer, so
  ;; referring to a variable STRING here would be an unbound-variable error.
  (:method :around (model (sentence string) n)
    (call-next-method
     model (tokenize <word-tokenizer> sentence) n)))
Parsing
;; PARSE for a PCFG grammar and a sentence given as a list.
;; The LET* form is the body handed to the local CKY macro, which splices it
;; into its innermost loop; RULE, Q, S, I, J, PI0, MIN, MAX and ARG are
;; bound by that macro's expansion, not in this method.
(defmethod parse ((grammar pcfg) (sentence list))
;; CUR pairs the current rule with the split position S - 1.
(CKY (let* ((cur (cons rule (1- s)))
;; L: PI0 entry for (second rule) at indices I-1, S-1.
(l (@ pi0 (1- i) (1- s)
(second rule)))
;; R: PI0 entry for (third rule) at indices S, J-1.
(r (@ pi0 s (1- j) (third rule)))
;; Sum of (log Q) and both entries; MIN when either entry is missing.
(score (if (and l r)
(+ (log q) l r)
min)))
;; Keep the highest score seen so far together with the CUR that gave it.
(when (> score (or max min))
(setf max score
arg cur)))))
Parsing
;; :AROUND method of PARSE for a PCFG grammar and a list sentence.
;; Inside WITH-RAW-RESULTS it returns four values: the result of
;; DECODE-PARSE-TREE passed through IDX->NTS, EXP of the PI0 entry at
;; (0, LAST, IROOT) -- or of MIN when that entry is NIL -- then PI0 and BPS.
;; NOTE(review): PI0, BPS, LAST, IROOT and MIN are not bound anywhere in this
;; snippet; presumably WITH-RAW-RESULTS or elided code supplies them -- confirm.
(defmethod parse :around ((grammar pcfg)
(sentence list))
(with-raw-results
(values (idx->nts (decode-parse-tree
sentence bps 0 last iroot))
(exp (or (@ pi0 0 last iroot) min))
pi0
bps)))
;; Local macro CKY: wraps BODY in the nested loops shown below, with the
;; RULES and NTS->IDX slots of GRAMMAR in scope, and returns PI0 and BPS as
;; two values.  BODY is evaluated with RULE, Q, S, I, J, K, MAX, ARG, PI0,
;; BPS and MIN visible (the expansion captures these names on purpose).
;; NOTE(review): the MACROLET form is not closed in this excerpt; presumably
;; the methods that use CKY form its body -- confirm against the full source.
(macrolet
((CKY (&body body)
`(with-slots (rules nts->idx) grammar
;; PI0 and BPS start as empty #{} tables; MIN stands for "no score".
(let* ((pi0 #{}) (bps #{})
(min most-negative-single-float))
;; init pi0 & bps (skipped)
;; POS runs from 1 up to *SENTENCE-LENGTH* - 1.
(do ((pos 1 (1+ pos)))
((>= pos *sentence-length*))
;; I runs from 1 while I <= *SENTENCE-LENGTH* - POS; J is I + POS.
(do ((i 1 (1+ i)))
((> i (- *sentence-length* pos)))
(let ((j (+ i pos)))
;; K takes the second variable of each NTS->IDX entry; the first is ignored.
(dotable (_ k nts->idx)
;; MAX and ARG start as NIL for every (I, J, K) combination.
(let (max arg)
;; S runs from I up to J - 1.
(do ((s i (1+ s)))
((>= s j))
(dotable (rule q rules)
;; BODY runs only for rules passing TRYADIC whose first element is K.
(when (and (tryadic rule)
(= k (first rule)))
,@body)))
;; MAX is still NIL (a list) when BODY never set it; store only a MAX above MIN.
(when (if (listp max) max (> max min))
(setf (@ pi0 (1- i) (1- j) k) max
(@ bps (1- i) (1- j) k) arg)))))))
(values pi0 bps)))))
(declaim (inline @))
(defun @ (m i j k)
  "Look up in table M the entry stored for the index triple I, J, K.
The triple is flattened into the single key
I * *SENTENCE-LENGTH* * *NT-COUNT* + J * *NT-COUNT* + K."
  (let ((flat-key (+ (* i *sentence-length* *nt-count*)
                     (* j *nt-count*)
                     k)))
    (get# flat-key m)))
;; SETF expander for @: (setf (@ m i j k) v) becomes a SET# call on M with
;; the same flattened key that @ reads,
;; I * *SENTENCE-LENGTH* * *NT-COUNT* + J * *NT-COUNT* + K.
(defsetf @ (m i j k) (v)
  (let ((flat-key-form `(+ (* ,i *sentence-length* *nt-count*)
                           (* ,j *nt-count*)
                           ,k)))
    `(set# ,flat-key-form ,m ,v)))
http://lisp-univ-etc.blogspot.com/search/label/nltk

More Related Content

What's hot

Getting groovy (ODP)
Getting groovy (ODP)Getting groovy (ODP)
Getting groovy (ODP)
Nick Dixon
 
The Rust Borrow Checker
The Rust Borrow CheckerThe Rust Borrow Checker
The Rust Borrow Checker
Nell Shamrell-Harrington
 
Austin Bingham. Python Refactoring. PyCon Belarus
Austin Bingham. Python Refactoring. PyCon BelarusAustin Bingham. Python Refactoring. PyCon Belarus
Austin Bingham. Python Refactoring. PyCon Belarus
Alina Dolgikh
 
On UnQLite
On UnQLiteOn UnQLite
On UnQLite
charsbar
 
Rihards Olups - Encrypting Daemon Traffic With Zabbix 3.0
Rihards Olups - Encrypting Daemon Traffic With Zabbix 3.0Rihards Olups - Encrypting Daemon Traffic With Zabbix 3.0
Rihards Olups - Encrypting Daemon Traffic With Zabbix 3.0
Zabbix
 
Modern javascript localization with c-3po and the good old gettext
Modern javascript localization with c-3po and the good old gettextModern javascript localization with c-3po and the good old gettext
Modern javascript localization with c-3po and the good old gettext
Alexander Mostovenko
 
Introduction to source{d} Engine and source{d} Lookout
Introduction to source{d} Engine and source{d} Lookout Introduction to source{d} Engine and source{d} Lookout
Introduction to source{d} Engine and source{d} Lookout
source{d}
 
Writing and using php streams and sockets
Writing and using php streams and socketsWriting and using php streams and sockets
Writing and using php streams and sockets
Elizabeth Smith
 
"Развитие ветки PHP-7"
"Развитие ветки PHP-7""Развитие ветки PHP-7"
"Развитие ветки PHP-7"
Badoo Development
 
Perl 5 & 6 regex complexity
Perl 5 & 6 regex complexityPerl 5 & 6 regex complexity
Perl 5 & 6 regex complexity
Radoslaw Kotowicz
 
The Ring programming language version 1.10 book - Part 49 of 212
The Ring programming language version 1.10 book - Part 49 of 212The Ring programming language version 1.10 book - Part 49 of 212
The Ring programming language version 1.10 book - Part 49 of 212
Mahmoud Samir Fayed
 
基于 Google protobuf 的 webgame 网络协议设计
基于 Google protobuf 的 webgame 网络协议设计基于 Google protobuf 的 webgame 网络协议设计
基于 Google protobuf 的 webgame 网络协议设计勇浩 赖
 
The Ring programming language version 1.5.3 book - Part 39 of 184
The Ring programming language version 1.5.3 book - Part 39 of 184The Ring programming language version 1.5.3 book - Part 39 of 184
The Ring programming language version 1.5.3 book - Part 39 of 184
Mahmoud Samir Fayed
 
Is boilerplate code really so bad?
Is boilerplate code really so bad?Is boilerplate code really so bad?
Is boilerplate code really so bad?
Trisha Gee
 
JSON-RPC Proxy Generation with PHP 5
JSON-RPC Proxy Generation with PHP 5JSON-RPC Proxy Generation with PHP 5
JSON-RPC Proxy Generation with PHP 5Stephan Schmidt
 
Gradle in a Polyglot World
Gradle in a Polyglot WorldGradle in a Polyglot World
Gradle in a Polyglot World
Schalk Cronjé
 
Linux kernel TLS и HTTPS / Александр Крижановский (Tempesta Technologies)
Linux kernel TLS и HTTPS / Александр Крижановский (Tempesta Technologies)Linux kernel TLS и HTTPS / Александр Крижановский (Tempesta Technologies)
Linux kernel TLS и HTTPS / Александр Крижановский (Tempesta Technologies)
Ontico
 
WordPress Performance Tuning
WordPress Performance TuningWordPress Performance Tuning
WordPress Performance Tuning
Javier Arturo Rodríguez
 
Introduction to Groovy (Serbian Developer Conference 2013)
Introduction to Groovy (Serbian Developer Conference 2013)Introduction to Groovy (Serbian Developer Conference 2013)
Introduction to Groovy (Serbian Developer Conference 2013)
Joachim Baumann
 

What's hot (20)

Getting groovy (ODP)
Getting groovy (ODP)Getting groovy (ODP)
Getting groovy (ODP)
 
The Rust Borrow Checker
The Rust Borrow CheckerThe Rust Borrow Checker
The Rust Borrow Checker
 
Austin Bingham. Python Refactoring. PyCon Belarus
Austin Bingham. Python Refactoring. PyCon BelarusAustin Bingham. Python Refactoring. PyCon Belarus
Austin Bingham. Python Refactoring. PyCon Belarus
 
On UnQLite
On UnQLiteOn UnQLite
On UnQLite
 
Rihards Olups - Encrypting Daemon Traffic With Zabbix 3.0
Rihards Olups - Encrypting Daemon Traffic With Zabbix 3.0Rihards Olups - Encrypting Daemon Traffic With Zabbix 3.0
Rihards Olups - Encrypting Daemon Traffic With Zabbix 3.0
 
Modern javascript localization with c-3po and the good old gettext
Modern javascript localization with c-3po and the good old gettextModern javascript localization with c-3po and the good old gettext
Modern javascript localization with c-3po and the good old gettext
 
Os Welton
Os WeltonOs Welton
Os Welton
 
Introduction to source{d} Engine and source{d} Lookout
Introduction to source{d} Engine and source{d} Lookout Introduction to source{d} Engine and source{d} Lookout
Introduction to source{d} Engine and source{d} Lookout
 
Writing and using php streams and sockets
Writing and using php streams and socketsWriting and using php streams and sockets
Writing and using php streams and sockets
 
"Развитие ветки PHP-7"
"Развитие ветки PHP-7""Развитие ветки PHP-7"
"Развитие ветки PHP-7"
 
Perl 5 & 6 regex complexity
Perl 5 & 6 regex complexityPerl 5 & 6 regex complexity
Perl 5 & 6 regex complexity
 
The Ring programming language version 1.10 book - Part 49 of 212
The Ring programming language version 1.10 book - Part 49 of 212The Ring programming language version 1.10 book - Part 49 of 212
The Ring programming language version 1.10 book - Part 49 of 212
 
基于 Google protobuf 的 webgame 网络协议设计
基于 Google protobuf 的 webgame 网络协议设计基于 Google protobuf 的 webgame 网络协议设计
基于 Google protobuf 的 webgame 网络协议设计
 
The Ring programming language version 1.5.3 book - Part 39 of 184
The Ring programming language version 1.5.3 book - Part 39 of 184The Ring programming language version 1.5.3 book - Part 39 of 184
The Ring programming language version 1.5.3 book - Part 39 of 184
 
Is boilerplate code really so bad?
Is boilerplate code really so bad?Is boilerplate code really so bad?
Is boilerplate code really so bad?
 
JSON-RPC Proxy Generation with PHP 5
JSON-RPC Proxy Generation with PHP 5JSON-RPC Proxy Generation with PHP 5
JSON-RPC Proxy Generation with PHP 5
 
Gradle in a Polyglot World
Gradle in a Polyglot WorldGradle in a Polyglot World
Gradle in a Polyglot World
 
Linux kernel TLS и HTTPS / Александр Крижановский (Tempesta Technologies)
Linux kernel TLS и HTTPS / Александр Крижановский (Tempesta Technologies)Linux kernel TLS и HTTPS / Александр Крижановский (Tempesta Technologies)
Linux kernel TLS и HTTPS / Александр Крижановский (Tempesta Technologies)
 
WordPress Performance Tuning
WordPress Performance TuningWordPress Performance Tuning
WordPress Performance Tuning
 
Introduction to Groovy (Serbian Developer Conference 2013)
Introduction to Groovy (Serbian Developer Conference 2013)Introduction to Groovy (Serbian Developer Conference 2013)
Introduction to Groovy (Serbian Developer Conference 2013)
 

Viewers also liked

Crash Course in Natural Language Processing (2016)
Crash Course in Natural Language Processing (2016)Crash Course in Natural Language Processing (2016)
Crash Course in Natural Language Processing (2016)
Vsevolod Dyomkin
 
Aspects of NLP Practice
Aspects of NLP PracticeAspects of NLP Practice
Aspects of NLP Practice
Vsevolod Dyomkin
 
Tedxkyiv communication guidelines
Tedxkyiv communication guidelinesTedxkyiv communication guidelines
Tedxkyiv communication guidelinesVsevolod Dyomkin
 
Чему мы можем научиться у Lisp'а?
Чему мы можем научиться у Lisp'а?Чему мы можем научиться у Lisp'а?
Чему мы можем научиться у Lisp'а?Vsevolod Dyomkin
 
Новые нереляционные системы хранения данных
Новые нереляционные системы хранения данныхНовые нереляционные системы хранения данных
Новые нереляционные системы хранения данныхVsevolod Dyomkin
 
Lisp как универсальная обертка
Lisp как универсальная оберткаLisp как универсальная обертка
Lisp как универсальная оберткаVsevolod Dyomkin
 
Can functional programming be liberated from static typing?
Can functional programming be liberated from static typing?Can functional programming be liberated from static typing?
Can functional programming be liberated from static typing?
Vsevolod Dyomkin
 
NLP in the WILD or Building a System for Text Language Identification
NLP in the WILD or Building a System for Text Language IdentificationNLP in the WILD or Building a System for Text Language Identification
NLP in the WILD or Building a System for Text Language Identification
Vsevolod Dyomkin
 
Practical NLP with Lisp
Practical NLP with LispPractical NLP with Lisp
Practical NLP with Lisp
Vsevolod Dyomkin
 
Lisp for Python Programmers
Lisp for Python ProgrammersLisp for Python Programmers
Lisp for Python ProgrammersVsevolod Dyomkin
 
Экосистема Common Lisp
Экосистема Common LispЭкосистема Common Lisp
Экосистема Common Lisp
Vsevolod Dyomkin
 
NLP Project Full Cycle
NLP Project Full CycleNLP Project Full Cycle
NLP Project Full Cycle
Vsevolod Dyomkin
 
Sugaring Lisp for the 21st Century
Sugaring Lisp for the 21st CenturySugaring Lisp for the 21st Century
Sugaring Lisp for the 21st Century
Vsevolod Dyomkin
 
Crash-course in Natural Language Processing
Crash-course in Natural Language ProcessingCrash-course in Natural Language Processing
Crash-course in Natural Language Processing
Vsevolod Dyomkin
 
Natural Language Processing in Practice
Natural Language Processing in PracticeNatural Language Processing in Practice
Natural Language Processing in PracticeVsevolod Dyomkin
 
Natural language processing
Natural language processingNatural language processing
Natural language processing
Hansi Thenuwara
 

Viewers also liked (17)

Crash Course in Natural Language Processing (2016)
Crash Course in Natural Language Processing (2016)Crash Course in Natural Language Processing (2016)
Crash Course in Natural Language Processing (2016)
 
Aspects of NLP Practice
Aspects of NLP PracticeAspects of NLP Practice
Aspects of NLP Practice
 
Tedxkyiv communication guidelines
Tedxkyiv communication guidelinesTedxkyiv communication guidelines
Tedxkyiv communication guidelines
 
Lisp Machine Prunciples
Lisp Machine PrunciplesLisp Machine Prunciples
Lisp Machine Prunciples
 
Чему мы можем научиться у Lisp'а?
Чему мы можем научиться у Lisp'а?Чему мы можем научиться у Lisp'а?
Чему мы можем научиться у Lisp'а?
 
Новые нереляционные системы хранения данных
Новые нереляционные системы хранения данныхНовые нереляционные системы хранения данных
Новые нереляционные системы хранения данных
 
Lisp как универсальная обертка
Lisp как универсальная оберткаLisp как универсальная обертка
Lisp как универсальная обертка
 
Can functional programming be liberated from static typing?
Can functional programming be liberated from static typing?Can functional programming be liberated from static typing?
Can functional programming be liberated from static typing?
 
NLP in the WILD or Building a System for Text Language Identification
NLP in the WILD or Building a System for Text Language IdentificationNLP in the WILD or Building a System for Text Language Identification
NLP in the WILD or Building a System for Text Language Identification
 
Practical NLP with Lisp
Practical NLP with LispPractical NLP with Lisp
Practical NLP with Lisp
 
Lisp for Python Programmers
Lisp for Python ProgrammersLisp for Python Programmers
Lisp for Python Programmers
 
Экосистема Common Lisp
Экосистема Common LispЭкосистема Common Lisp
Экосистема Common Lisp
 
NLP Project Full Cycle
NLP Project Full CycleNLP Project Full Cycle
NLP Project Full Cycle
 
Sugaring Lisp for the 21st Century
Sugaring Lisp for the 21st CenturySugaring Lisp for the 21st Century
Sugaring Lisp for the 21st Century
 
Crash-course in Natural Language Processing
Crash-course in Natural Language ProcessingCrash-course in Natural Language Processing
Crash-course in Natural Language Processing
 
Natural Language Processing in Practice
Natural Language Processing in PracticeNatural Language Processing in Practice
Natural Language Processing in Practice
 
Natural language processing
Natural language processingNatural language processing
Natural language processing
 

Similar to CL-NLP

Enhancing Domain Specific Language Implementations Through Ontology
Enhancing Domain Specific Language Implementations Through OntologyEnhancing Domain Specific Language Implementations Through Ontology
Enhancing Domain Specific Language Implementations Through Ontology
Chunhua Liao
 
Perl at SkyCon'12
Perl at SkyCon'12Perl at SkyCon'12
Perl at SkyCon'12
Tim Bunce
 
ElasticSearch for .NET Developers
ElasticSearch for .NET DevelopersElasticSearch for .NET Developers
ElasticSearch for .NET Developers
Ben van Mol
 
Perl - laziness, impatience, hubris, and one liners
Perl - laziness, impatience, hubris, and one linersPerl - laziness, impatience, hubris, and one liners
Perl - laziness, impatience, hubris, and one liners
Kirk Kimmel
 
JDD 2016 - Tomasz Borek - DB for next project? Why, Postgres, of course
JDD 2016 - Tomasz Borek - DB for next project? Why, Postgres, of course JDD 2016 - Tomasz Borek - DB for next project? Why, Postgres, of course
JDD 2016 - Tomasz Borek - DB for next project? Why, Postgres, of course
PROIDEA
 
Learning Puppet basic thing
Learning Puppet basic thing Learning Puppet basic thing
Learning Puppet basic thing
DaeHyung Lee
 
What we can learn from Rebol?
What we can learn from Rebol?What we can learn from Rebol?
What we can learn from Rebol?
lichtkind
 
JRuby e DSL
JRuby e DSLJRuby e DSL
JRuby e DSL
jodosha
 
APMG juni 2014 - Regular Expression
APMG juni 2014 - Regular ExpressionAPMG juni 2014 - Regular Expression
APMG juni 2014 - Regular ExpressionByte
 
Jsonsaga 100605143125-phpapp02
Jsonsaga 100605143125-phpapp02Jsonsaga 100605143125-phpapp02
Jsonsaga 100605143125-phpapp02Ramamohan Chokkam
 
Spark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj Talk
Spark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj TalkSpark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj Talk
Spark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj Talk
Zalando Technology
 
Building DSLs On CLR and DLR (Microsoft.NET)
Building DSLs On CLR and DLR (Microsoft.NET)Building DSLs On CLR and DLR (Microsoft.NET)
Building DSLs On CLR and DLR (Microsoft.NET)Vitaly Baum
 
Neo4j after 1 year in production
Neo4j after 1 year in productionNeo4j after 1 year in production
Neo4j after 1 year in production
Andrew Nikishaev
 
Scala Parser Combinators - Scalapeno Lightning Talk
Scala Parser Combinators - Scalapeno Lightning TalkScala Parser Combinators - Scalapeno Lightning Talk
Scala Parser Combinators - Scalapeno Lightning Talk
Lior Schejter
 
Bioinformatica: Esercizi su Perl, espressioni regolari e altre amenità (BMR G...
Bioinformatica: Esercizi su Perl, espressioni regolari e altre amenità (BMR G...Bioinformatica: Esercizi su Perl, espressioni regolari e altre amenità (BMR G...
Bioinformatica: Esercizi su Perl, espressioni regolari e altre amenità (BMR G...
Andrea Telatin
 
Apache Spark Workshop
Apache Spark WorkshopApache Spark Workshop
Apache Spark Workshop
Michael Spector
 
Redis: REmote DIctionary Server
Redis: REmote DIctionary ServerRedis: REmote DIctionary Server
Redis: REmote DIctionary ServerEzra Zygmuntowicz
 
A Deep Dive Into Spark
A Deep Dive Into SparkA Deep Dive Into Spark
A Deep Dive Into Spark
Ashish kumar
 
DSLs for fun and profit by Jukka Välimaa
DSLs for fun and profit by Jukka VälimaaDSLs for fun and profit by Jukka Välimaa
DSLs for fun and profit by Jukka Välimaa
Montel Intergalactic
 

Similar to CL-NLP (20)

Enhancing Domain Specific Language Implementations Through Ontology
Enhancing Domain Specific Language Implementations Through OntologyEnhancing Domain Specific Language Implementations Through Ontology
Enhancing Domain Specific Language Implementations Through Ontology
 
Perl at SkyCon'12
Perl at SkyCon'12Perl at SkyCon'12
Perl at SkyCon'12
 
ElasticSearch for .NET Developers
ElasticSearch for .NET DevelopersElasticSearch for .NET Developers
ElasticSearch for .NET Developers
 
Perl - laziness, impatience, hubris, and one liners
Perl - laziness, impatience, hubris, and one linersPerl - laziness, impatience, hubris, and one liners
Perl - laziness, impatience, hubris, and one liners
 
JDD 2016 - Tomasz Borek - DB for next project? Why, Postgres, of course
JDD 2016 - Tomasz Borek - DB for next project? Why, Postgres, of course JDD 2016 - Tomasz Borek - DB for next project? Why, Postgres, of course
JDD 2016 - Tomasz Borek - DB for next project? Why, Postgres, of course
 
Learning Puppet basic thing
Learning Puppet basic thing Learning Puppet basic thing
Learning Puppet basic thing
 
What we can learn from Rebol?
What we can learn from Rebol?What we can learn from Rebol?
What we can learn from Rebol?
 
JRuby e DSL
JRuby e DSLJRuby e DSL
JRuby e DSL
 
APMG juni 2014 - Regular Expression
APMG juni 2014 - Regular ExpressionAPMG juni 2014 - Regular Expression
APMG juni 2014 - Regular Expression
 
Jsonsaga 100605143125-phpapp02
Jsonsaga 100605143125-phpapp02Jsonsaga 100605143125-phpapp02
Jsonsaga 100605143125-phpapp02
 
Spark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj Talk
Spark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj TalkSpark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj Talk
Spark + Clojure for Topic Discovery - Zalando Tech Clojure/Conj Talk
 
Building DSLs On CLR and DLR (Microsoft.NET)
Building DSLs On CLR and DLR (Microsoft.NET)Building DSLs On CLR and DLR (Microsoft.NET)
Building DSLs On CLR and DLR (Microsoft.NET)
 
Neo4j after 1 year in production
Neo4j after 1 year in productionNeo4j after 1 year in production
Neo4j after 1 year in production
 
Meta Object Protocols
Meta Object ProtocolsMeta Object Protocols
Meta Object Protocols
 
Scala Parser Combinators - Scalapeno Lightning Talk
Scala Parser Combinators - Scalapeno Lightning TalkScala Parser Combinators - Scalapeno Lightning Talk
Scala Parser Combinators - Scalapeno Lightning Talk
 
Bioinformatica: Esercizi su Perl, espressioni regolari e altre amenità (BMR G...
Bioinformatica: Esercizi su Perl, espressioni regolari e altre amenità (BMR G...Bioinformatica: Esercizi su Perl, espressioni regolari e altre amenità (BMR G...
Bioinformatica: Esercizi su Perl, espressioni regolari e altre amenità (BMR G...
 
Apache Spark Workshop
Apache Spark WorkshopApache Spark Workshop
Apache Spark Workshop
 
Redis: REmote DIctionary Server
Redis: REmote DIctionary ServerRedis: REmote DIctionary Server
Redis: REmote DIctionary Server
 
A Deep Dive Into Spark
A Deep Dive Into SparkA Deep Dive Into Spark
A Deep Dive Into Spark
 
DSLs for fun and profit by Jukka Välimaa
DSLs for fun and profit by Jukka VälimaaDSLs for fun and profit by Jukka Välimaa
DSLs for fun and profit by Jukka Välimaa
 

Recently uploaded

Epistemic Interaction - tuning interfaces to provide information for AI support
Epistemic Interaction - tuning interfaces to provide information for AI supportEpistemic Interaction - tuning interfaces to provide information for AI support
Epistemic Interaction - tuning interfaces to provide information for AI support
Alan Dix
 
Introduction to CHERI technology - Cybersecurity
Introduction to CHERI technology - CybersecurityIntroduction to CHERI technology - Cybersecurity
Introduction to CHERI technology - Cybersecurity
mikeeftimakis1
 
Uni Systems Copilot event_05062024_C.Vlachos.pdf
Uni Systems Copilot event_05062024_C.Vlachos.pdfUni Systems Copilot event_05062024_C.Vlachos.pdf
Uni Systems Copilot event_05062024_C.Vlachos.pdf
Uni Systems S.M.S.A.
 
By Design, not by Accident - Agile Venture Bolzano 2024
By Design, not by Accident - Agile Venture Bolzano 2024By Design, not by Accident - Agile Venture Bolzano 2024
By Design, not by Accident - Agile Venture Bolzano 2024
Pierluigi Pugliese
 
Pushing the limits of ePRTC: 100ns holdover for 100 days
Pushing the limits of ePRTC: 100ns holdover for 100 daysPushing the limits of ePRTC: 100ns holdover for 100 days
Pushing the limits of ePRTC: 100ns holdover for 100 days
Adtran
 
Artificial Intelligence for XMLDevelopment
Artificial Intelligence for XMLDevelopmentArtificial Intelligence for XMLDevelopment
Artificial Intelligence for XMLDevelopment
Octavian Nadolu
 
GraphSummit Singapore | Neo4j Product Vision & Roadmap - Q2 2024
GraphSummit Singapore | Neo4j Product Vision & Roadmap - Q2 2024GraphSummit Singapore | Neo4j Product Vision & Roadmap - Q2 2024
GraphSummit Singapore | Neo4j Product Vision & Roadmap - Q2 2024
Neo4j
 
PCI PIN Basics Webinar from the Controlcase Team
PCI PIN Basics Webinar from the Controlcase TeamPCI PIN Basics Webinar from the Controlcase Team
PCI PIN Basics Webinar from the Controlcase Team
ControlCase
 
Observability Concepts EVERY Developer Should Know -- DeveloperWeek Europe.pdf
Observability Concepts EVERY Developer Should Know -- DeveloperWeek Europe.pdfObservability Concepts EVERY Developer Should Know -- DeveloperWeek Europe.pdf
Observability Concepts EVERY Developer Should Know -- DeveloperWeek Europe.pdf
Paige Cruz
 
Enchancing adoption of Open Source Libraries. A case study on Albumentations.AI
Enchancing adoption of Open Source Libraries. A case study on Albumentations.AIEnchancing adoption of Open Source Libraries. A case study on Albumentations.AI
Enchancing adoption of Open Source Libraries. A case study on Albumentations.AI
Vladimir Iglovikov, Ph.D.
 
Generative AI Deep Dive: Advancing from Proof of Concept to Production
Generative AI Deep Dive: Advancing from Proof of Concept to ProductionGenerative AI Deep Dive: Advancing from Proof of Concept to Production
Generative AI Deep Dive: Advancing from Proof of Concept to Production
Aggregage
 
Why You Should Replace Windows 11 with Nitrux Linux 3.5.0 for enhanced perfor...
Why You Should Replace Windows 11 with Nitrux Linux 3.5.0 for enhanced perfor...Why You Should Replace Windows 11 with Nitrux Linux 3.5.0 for enhanced perfor...
Why You Should Replace Windows 11 with Nitrux Linux 3.5.0 for enhanced perfor...
SOFTTECHHUB
 
20240609 QFM020 Irresponsible AI Reading List May 2024
20240609 QFM020 Irresponsible AI Reading List May 202420240609 QFM020 Irresponsible AI Reading List May 2024
20240609 QFM020 Irresponsible AI Reading List May 2024
Matthew Sinclair
 
UiPath Test Automation using UiPath Test Suite series, part 5
UiPath Test Automation using UiPath Test Suite series, part 5UiPath Test Automation using UiPath Test Suite series, part 5
UiPath Test Automation using UiPath Test Suite series, part 5
DianaGray10
 
DevOps and Testing slides at DASA Connect
DevOps and Testing slides at DASA ConnectDevOps and Testing slides at DASA Connect
DevOps and Testing slides at DASA Connect
Kari Kakkonen
 
Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...
Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...
Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...
James Anderson
 
Free Complete Python - A step towards Data Science
Free Complete Python - A step towards Data ScienceFree Complete Python - A step towards Data Science
Free Complete Python - A step towards Data Science
RinaMondal9
 
RESUME BUILDER APPLICATION Project for students
RESUME BUILDER APPLICATION Project for studentsRESUME BUILDER APPLICATION Project for students
RESUME BUILDER APPLICATION Project for students
KAMESHS29
 
Communications Mining Series - Zero to Hero - Session 1
Communications Mining Series - Zero to Hero - Session 1Communications Mining Series - Zero to Hero - Session 1
Communications Mining Series - Zero to Hero - Session 1
DianaGray10
 
みなさんこんにちはこれ何文字まで入るの?40文字以下不可とか本当に意味わからないけどこれ限界文字数書いてないからマジでやばい文字数いけるんじゃないの?えこ...
みなさんこんにちはこれ何文字まで入るの?40文字以下不可とか本当に意味わからないけどこれ限界文字数書いてないからマジでやばい文字数いけるんじゃないの?えこ...みなさんこんにちはこれ何文字まで入るの?40文字以下不可とか本当に意味わからないけどこれ限界文字数書いてないからマジでやばい文字数いけるんじゃないの?えこ...
みなさんこんにちはこれ何文字まで入るの?40文字以下不可とか本当に意味わからないけどこれ限界文字数書いてないからマジでやばい文字数いけるんじゃないの?えこ...
名前 です男
 

Recently uploaded (20)

Epistemic Interaction - tuning interfaces to provide information for AI support
Epistemic Interaction - tuning interfaces to provide information for AI supportEpistemic Interaction - tuning interfaces to provide information for AI support
Epistemic Interaction - tuning interfaces to provide information for AI support
 
Introduction to CHERI technology - Cybersecurity
Introduction to CHERI technology - CybersecurityIntroduction to CHERI technology - Cybersecurity
Introduction to CHERI technology - Cybersecurity
 
Uni Systems Copilot event_05062024_C.Vlachos.pdf
Uni Systems Copilot event_05062024_C.Vlachos.pdfUni Systems Copilot event_05062024_C.Vlachos.pdf
Uni Systems Copilot event_05062024_C.Vlachos.pdf
 
By Design, not by Accident - Agile Venture Bolzano 2024
By Design, not by Accident - Agile Venture Bolzano 2024By Design, not by Accident - Agile Venture Bolzano 2024
By Design, not by Accident - Agile Venture Bolzano 2024
 
Pushing the limits of ePRTC: 100ns holdover for 100 days
Pushing the limits of ePRTC: 100ns holdover for 100 daysPushing the limits of ePRTC: 100ns holdover for 100 days
Pushing the limits of ePRTC: 100ns holdover for 100 days
 
Artificial Intelligence for XMLDevelopment
Artificial Intelligence for XMLDevelopmentArtificial Intelligence for XMLDevelopment
Artificial Intelligence for XMLDevelopment
 
GraphSummit Singapore | Neo4j Product Vision & Roadmap - Q2 2024
GraphSummit Singapore | Neo4j Product Vision & Roadmap - Q2 2024GraphSummit Singapore | Neo4j Product Vision & Roadmap - Q2 2024
GraphSummit Singapore | Neo4j Product Vision & Roadmap - Q2 2024
 
PCI PIN Basics Webinar from the Controlcase Team
PCI PIN Basics Webinar from the Controlcase TeamPCI PIN Basics Webinar from the Controlcase Team
PCI PIN Basics Webinar from the Controlcase Team
 
Observability Concepts EVERY Developer Should Know -- DeveloperWeek Europe.pdf
Observability Concepts EVERY Developer Should Know -- DeveloperWeek Europe.pdfObservability Concepts EVERY Developer Should Know -- DeveloperWeek Europe.pdf
Observability Concepts EVERY Developer Should Know -- DeveloperWeek Europe.pdf
 
Enchancing adoption of Open Source Libraries. A case study on Albumentations.AI
Enchancing adoption of Open Source Libraries. A case study on Albumentations.AIEnchancing adoption of Open Source Libraries. A case study on Albumentations.AI
Enchancing adoption of Open Source Libraries. A case study on Albumentations.AI
 
Generative AI Deep Dive: Advancing from Proof of Concept to Production
Generative AI Deep Dive: Advancing from Proof of Concept to ProductionGenerative AI Deep Dive: Advancing from Proof of Concept to Production
Generative AI Deep Dive: Advancing from Proof of Concept to Production
 
Why You Should Replace Windows 11 with Nitrux Linux 3.5.0 for enhanced perfor...
Why You Should Replace Windows 11 with Nitrux Linux 3.5.0 for enhanced perfor...Why You Should Replace Windows 11 with Nitrux Linux 3.5.0 for enhanced perfor...
Why You Should Replace Windows 11 with Nitrux Linux 3.5.0 for enhanced perfor...
 
20240609 QFM020 Irresponsible AI Reading List May 2024
20240609 QFM020 Irresponsible AI Reading List May 202420240609 QFM020 Irresponsible AI Reading List May 2024
20240609 QFM020 Irresponsible AI Reading List May 2024
 
UiPath Test Automation using UiPath Test Suite series, part 5
UiPath Test Automation using UiPath Test Suite series, part 5UiPath Test Automation using UiPath Test Suite series, part 5
UiPath Test Automation using UiPath Test Suite series, part 5
 
DevOps and Testing slides at DASA Connect
DevOps and Testing slides at DASA ConnectDevOps and Testing slides at DASA Connect
DevOps and Testing slides at DASA Connect
 
Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...
Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...
Alt. GDG Cloud Southlake #33: Boule & Rebala: Effective AppSec in SDLC using ...
 
Free Complete Python - A step towards Data Science
Free Complete Python - A step towards Data ScienceFree Complete Python - A step towards Data Science
Free Complete Python - A step towards Data Science
 
RESUME BUILDER APPLICATION Project for students
RESUME BUILDER APPLICATION Project for studentsRESUME BUILDER APPLICATION Project for students
RESUME BUILDER APPLICATION Project for students
 
Communications Mining Series - Zero to Hero - Session 1
Communications Mining Series - Zero to Hero - Session 1Communications Mining Series - Zero to Hero - Session 1
Communications Mining Series - Zero to Hero - Session 1
 
みなさんこんにちはこれ何文字まで入るの?40文字以下不可とか本当に意味わからないけどこれ限界文字数書いてないからマジでやばい文字数いけるんじゃないの?えこ...
みなさんこんにちはこれ何文字まで入るの?40文字以下不可とか本当に意味わからないけどこれ限界文字数書いてないからマジでやばい文字数いけるんじゃないの?えこ...みなさんこんにちはこれ何文字まで入るの?40文字以下不可とか本当に意味わからないけどこれ限界文字数書いてないからマジでやばい文字数いけるんじゃないの?えこ...
みなさんこんにちはこれ何文字まで入るの?40文字以下不可とか本当に意味わからないけどこれ限界文字数書いてないからマジでやばい文字数いけるんじゃないの?えこ...
 

CL-NLP

  • 1. CL-NLP – A Natural Language Processing Library for Common Lisp Vsevolod Dyomkin
  • 2. Topics * Motivation * Library high-level design * Overview of the functions * Some technical details * Next steps
  • 3. Motivation NLP & Lisp is a match, made in heaven
  • 4. Previous Work * CL-Langutils * Lexiparse, Sparser, CL-EARLY- PARSER, Basic-English-Grammar * Wordnet interfaces * CMU AI Repository * Natural Language Understanding * Natural Language Processing in Lisp
  • 5. Essential Scope * Corpora interfaces * Language modeling * Measures * Tokenization * Tagging * Parsing * Wordnet interface * Construct a pipeline
  • 7. An NLP Pipeline “This is a test.” -> TOP | S /~~~~~~~~~~|~~~~~~~~ NP VP . | /~~~~~~ | DT VBZ NP . | | /~~~~ This is DT NN | | a test
  • 8. An NLP Pipeline (print-parse-trees "This is a test.") (defun print-parse-trees (text) (dolist (sentence (tokenize <sentence-tokenizer> text)) (pprint-tree (parse *pcfg* (tag *hmm* text))))) (defparameter *hmm* (train 'hmm-tagger "treebank_dir/")) (defparameter *pcfg* (train 'pcfg "treebank_dir/"))
  • 9. Some Design Choices * A pseudo-hierarchical package design
  • 10. High-level Design * cl-nlp & cl-nlp-contrib & more * base & functional packages * nlp-user
  • 11. Modules * nlp.core, nlp.corpora, nlp.util, nlp.test-util * nlp.syntax, nlp.generation, nlp.phonetics * nlp.contrib.wordnet, nlp.contrib.ms-ngrams
  • 12. Some Design Choices * Use the whole language - defclass & defstruct - hash-tables & alists - do & loop
  • 13. Some Design Choices * Grow the language - rutils + reader macros + shorthands + dotable, dolines - special-purpose utils
  • 14. General-Purpose Utils (json:encode-json #{ "success" t "sentence" text "match" (when-it (2nd (funcall fn (get-model text))) (strcat "{" (strjoin "," (mapcar #`(fmt "\"~A\":[~A,~A]" (car %) (cadr %) (cddr %)) (ht->alist it))) "}")) } stream)
  • 15. A Special-Purpose Util (define-lazy-singleton word-tokenizer (make 'postprocessing-regex-word-tokenizer) "Default word tokenizer.") (defun tokenize-ngram (ngrams str) "Transform string STR to a list if necessary (depending of order of NGRAMS)." (if (> (ngrams-order ngrams) 1) (tokenize <word-tokenizer> str) str))
  • 16. A Special-Purpose Util (defmacro define-lazy-singleton (name init &optional docstring) (with-gensyms (singleton) `(let (,singleton) (defun ,name () ,docstring (or ,singleton (setf ,singleton ,init))) (define-symbol-macro ,(mksym name :format "<~A>") (,name)))))
  • 17. Some Design Choices * Use CLOS as a foundation
  • 18. Basic Cell (defclass regex-word-tokenizer (tokenizer) ((regex :accessor tokenizer-regex :initarg :regex :initform (re:create-scanner "\\w+|[!\"#$%&'*+,./:;<=>?@^`~…\\(\\)⟨⟩{}\\[\\|\\]‒–—―«»“”‘’¶-]") :documentation "A simpler variant would be [^\\s]+ — it doesn't split punctuation, yet sometimes it's desirable.")) (:documentation "Regex-based word tokenizer."))
  • 19. Basic Cell (defmethod tokenize ((tokenizer regex-word-tokenizer) string) (loop :for (beg end) :on (re:all-matches (tokenizer-regex tokenizer) string) :by #'cddr :collect (sub string beg end) :into words :collect (cons beg end) :into spans :finally (return (values words spans))))
  • 20. Another Example (defgeneric parse (model sentence) (:documentation "Parse SENTENCE with MODEL.") (:method :around (model (sentence string)) (call-next-method model (tokenize <word-tokenizer> string)))) (defgeneric parse-n (model sentence n) (:documentation "Return N best parse trees of the SENTENCE with MODEL.") (:method :around (model (sentence string) n) (call-next-method model (tokenize <word-tokenizer> string) n)))
  • 21. Parsing (defmethod parse ((grammar pcfg) (sentence list)) (CKY (let* ((cur (cons rule (1- s))) (l (@ pi0 (1- i) (1- s) (second rule))) (r (@ pi0 s (1- j) (third rule))) (score (if (and l r) (+ (log q) l r) min))) (when (> score (or max min)) (setf max score arg cur)))))
  • 22. Parsing (defmethod parse :around ((grammar pcfg) (sentence list)) (with-raw-results (values (idx->nts (decode-parse-tree sentence bps 0 last iroot)) (exp (or (@ pi0 0 last iroot) min)) pi0 bps)))
  • 23. (macrolet ((CKY (&body body) `(with-slots (rules nts->idx) grammar (let* ((pi0 #{}) (bps #{}) (min most-negative-single-float)) ;; init pi0 & bps (skipped) (do ((pos 1 (1+ pos))) ((>= pos *sentence-length*)) (do ((i 1 (1+ i))) ((> i (- *sentence-length* pos))) (let ((j (+ i pos))) (dotable (_ k nts->idx) (let (max arg) (do ((s i (1+ s))) ((>= s j)) (dotable (rule q rules) (when (and (tryadic rule) (= k (first rule))) ,@body))) (when (if (listp max) max (> max min)) (setf (@ pi0 (1- i) (1- j) k) max (@ bps (1- i) (1- j) k) arg))))))) (values pi0 bps)))))
  • 24. (declaim (inline @)) (defun @ (m i j k) (get# (+ (* i *sentence-length* *nt-count*) (* j *nt-count*) k) m)) (defsetf @ (m i j k) (v) `(set# (+ (* ,i *sentence-length* *nt-count*) (* ,j *nt-count*) ,k) ,m ,v))