The document discusses metasearch engines, which send a search query to multiple search engines and combine the results drawn from their indices. It provides Python example code for a basic web search engine that crawls web pages, indexes keywords together with their URLs, looks up keywords in the index, and extracts links from pages. It also includes links to resources about information retrieval systems and metasearch engines.
7. Código Python – Web Search Engine
# Simulated web: maps a URL to the HTML text of that page, so the crawler
# can run without network access.
# NOTE(review): the page bodies are reproduced as recovered from the
# extracted text; any blank lines in the original pages were lost, so
# confirm against the course material if exact word positions matter.
cache = {
    'http://www.udacity.com/cs101x/final/multi.html': """<html>
<body>
<a href="http://www.udacity.com/cs101x/final/a.html">A</a><br>
<a href="http://www.udacity.com/cs101x/final/b.html">B</a><br>
</body>
""",
    'http://www.udacity.com/cs101x/final/b.html': """<html>
<body>
Monty likes the Python programming language
Thomas Jefferson founded the University of Virginia
When Mandela was in London, he visited Nelson's Column.
</body>
</html>
""",
    'http://www.udacity.com/cs101x/final/a.html': """<html>
<body>
Monty Python is not about a programming language
Udacity was not founded by Thomas Jefferson
Nelson Mandela said "Education is the most powerful weapon which you can
use to change the world."
</body>
</html>
""",
}


def get_page(url):
    """Return the cached content of *url*, or None if it is not cached.

    A message is printed for every URL that is missing from ``cache``.
    """
    if url in cache:
        return cache[url]
    # Single-argument call form behaves the same on Python 2 and Python 3.
    print("Page not in cache: " + url)
    return None


def union(a, b):
    """Append to list *a*, in place, every element of *b* not already in it."""
    for e in b:
        if e not in a:
            a.append(e)


def get_next_target(page):
    """Find the first ``<a href=`` link in *page*.

    Returns a ``(url, end_quote)`` pair, where ``end_quote`` is the offset
    of the quote that closes the URL, or ``(None, 0)`` when *page* contains
    no link.
    """
    start_link = page.find('<a href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote


def get_all_links(page):
    """Return the list of link targets found in *page*, in document order."""
    links = []
    while True:
        url, endpos = get_next_target(page)
        if not url:
            break
        links.append(url)
        # Continue scanning after the link that was just consumed.
        page = page[endpos:]
    return links


def add_to_index(index, keyword, url, pos):
    """Record in *index* that *keyword* occurs on *url* at offset *pos*.

    Each keyword maps to a list of ``[url, pos]`` entries.
    """
    index.setdefault(keyword, []).append([url, pos])


def add_page_to_index(index, url, content):
    """Index every whitespace-separated word of *content* under *url*.

    Each word is stored with the character offset at which it occurs in
    *content*.
    """
    pos = 0
    for word in content.split():
        pos = content.find(word, pos)
        add_to_index(index, word, url, pos)
        # Resume the search after this word; otherwise a following word that
        # is a substring of it (or a repeat of it) would be matched inside
        # the current word and get the wrong offset.
        pos += len(word)


def lookup(index, keyword):
    """Return the ``[url, pos]`` entries for *keyword*, or None if absent."""
    return index.get(keyword)


def crawl_web(seed):
    """Crawl the pages reachable from *seed* and build an index and a graph.

    Returns ``(index, graph)``: ``index`` maps each keyword to a list of
    ``[url, pos]`` entries, and ``graph`` maps each crawled URL to the list
    of pages it links to. Pages whose content is unavailable (``get_page``
    returns None) are skipped and do not appear in ``graph``.
    """
    tocrawl = [seed]
    crawled = set()
    graph = {}  # <url>, [list of pages it links to]
    index = {}
    while tocrawl:
        page = tocrawl.pop()
        if page in crawled:
            continue
        crawled.add(page)
        content = get_page(page)
        if content is None:
            # Nothing to index or follow for a page that is not available.
            continue
        add_page_to_index(index, page, content)
        outlinks = get_all_links(content)
        graph[page] = outlinks
        union(tocrawl, outlinks)
    return index, graph
http://www.udacity.com/cs101