環境準備(Ubuntu)
ここでは、Webページをダウンロードするためにurllib3というライブラリを使用し、Webページの解釈のためにBeautifulSoupというライブラリを使用する。
以下のコマンドを叩くことでそれらのライブラリをインストールすることが出来る。
$ sudo apt-get install python-pip
$ sudo pip install urllib3
$ sudo pip install beautifulsoup
クローラーを作る
初期ページのURLを与えたら、自動的にリンクを探して各ページを巡回するクローラーを作ってみる。
まずは巡回のみ
まずは、リンクを巡回する部分だ。これを実行すると以下の出力が得られる。#!/usr/bin/python import urllib3 from BeautifulSoup import * from urlparse import urljoin # Make ignore word set ignorewords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it']) class crawler: def __init__(self,dbname): pass def __del__(self): pass def dbcommit(self): pass def getentryid(self, table, field, value, createnew=True): return None def addtoindex(self,url,soup): print 'Indexing %s' % url def gettextonly(self, soup): return None def separatewords(self, text): return None def isindexed(self,url): return False def addlinkref(self, urlFrom, urlTo, linkText): pass def createindextables(self): pass def crawl(self, pages, depth=2): # Loop with depth for i in range(depth): newpages = set() for page in pages: # Download page's contents try: http = urllib3.PoolManager() contents = http.request('GET', page).data except: print "Could not open %s" % page continue # Register index soup = BeautifulSoup(contents) self.addtoindex(page, soup) # Crawl links links = soup('a') for link in links: if ('href' in dict(link.attrs)): url = urljoin(page, link['href']) if url.find("'") != -1: continue url = url.split('#')[0] # delete anchor if url[0:4] == 'http' and not self.isindexed(url): newpages.add(url) linkText = self.gettextonly(link) self.addlinkref(page, url, linkText) self.dbcommit() pages = newpages if __name__ == '__main__': pagelist=['http://kiwitobes.com/wiki/Perl.html'] crawler = crawler('') crawler.crawl(pagelist)
Indexing http://kiwitobes.com/wiki/Perl.html Indexing http://kiwitobes.com/wiki/Module_%28programming%29.html Indexing http://kiwitobes.com/wiki/Open_Directory_Project.html Indexing http://kiwitobes.com/wiki/Common_Gateway_Interface.html Indexing http://kiwitobes.com/wiki/List_%28computing%29.html Indexing http://kiwitobes.com/wiki/C_Sharp.html Indexing http://kiwitobes.com/wiki/Free_software.html Indexing http://kiwitobes.com/wiki/GNU_bison.html Indexing http://kiwitobes.com/wiki/Yacc.html Indexing http://kiwitobes.com/wiki/Solution_stack.html Indexing http://kiwitobes.com/wiki/If_and_only_if.html Indexing http://kiwitobes.com/wiki/Switch_statement.html Indexing http://kiwitobes.com/wiki/Bourne_shell.html Indexing http://kiwitobes.com/wiki/Glue_language.html Indexing http://kiwitobes.com/wiki/Quality_control.html Indexing http://kiwitobes.com/wiki/2001.html Indexing http://kiwitobes.com/wiki/ANSI.html Indexing http://kiwitobes.com/wiki/First-class_function.html Indexing http://kiwitobes.com/wiki/Perl_%28disambiguation%29.html Indexing http://kiwitobes.com/wiki/Hash_table.html Indexing http://kiwitobes.com/wiki/V6_%28Perl%29.html Indexing http://kiwitobes.com/wiki/The_Perl_Foundation.html Indexing http://kiwitobes.com/wiki/AmigaOS.html Indexing http://kiwitobes.com/wiki/Perl_Data_Language.html Indexing http://kiwitobes.com/wiki/Subroutine.html Indexing http://kiwitobes.com/wiki/As_of_2005.html Indexing http://kiwitobes.com/wiki/GNU_General_Public_License.html Indexing http://kiwitobes.com/wiki/Pragma.html Indexing http://kiwitobes.com/wiki/POSIX.html Indexing http://kiwitobes.com/wiki/O%27Reilly_Media.html Indexing http://kiwitobes.com/wiki/Eval.html Indexing http://kiwitobes.com/wiki/Scalar_%28computing%29.html Indexing http://kiwitobes.com/wiki/Filename_extension.html Indexing http://kiwitobes.com/wiki/Token_%28parser%29.html Indexing http://kiwitobes.com/wiki/Control_structure.html Indexing http://kiwitobes.com/wiki/Dynamic_typing.html Indexing 
http://kiwitobes.com/wiki/Closure_%28computer_science%29.html Indexing http://kiwitobes.com/wiki/Perl Indexing http://kiwitobes.com/wiki/Print.html Indexing http://kiwitobes.com/wiki/Microsoft_Windows.html Indexing http://kiwitobes.com/wiki/FreeBSD.html Indexing http://kiwitobes.com/wiki/Expression_%28programming%29.html Indexing http://kiwitobes.com/wiki/AWK_programming_language.html Indexing http://kiwitobes.com/wiki/Peephole_optimization.html Indexing http://kiwitobes.com/wiki/String_%28computer_science%29.html Indexing http://kiwitobes.com/wiki/Software_license.html Indexing http://kiwitobes.com/wiki/SUSE.html Indexing http://kiwitobes.com/wiki/Tar_%28file_format%29.html Indexing http://kiwitobes.com/wiki/Sigil_%28computer_programming%29.html Indexing http://kiwitobes.com/wiki/Pugs.html Indexing http://kiwitobes.com/wiki/BeOS.html Indexing http://kiwitobes.com/wiki/Comprehensive_Perl_Archive_Network.html Indexing http://kiwitobes.com/wiki/PerlScript.html Indexing http://kiwitobes.com/wiki/Gentoo_Linux.html Indexing http://kiwitobes.com/wiki/Data_structure.html Indexing http://kiwitobes.com/wiki/Object-oriented_programming.html Indexing http://kiwitobes.com/wiki/Grammar.html Indexing http://kiwitobes.com/wiki/Unix-like.html Indexing http://kiwitobes.com/wiki/Audrey_Tang.html Indexing http://kiwitobes.com/wiki/Bioinformatics.html Indexing http://kiwitobes.com/wiki/Ruby_programming_language.html Indexing http://kiwitobes.com/wiki/Backronym.html Indexing http://kiwitobes.com/wiki/Memory_management.html Indexing http://kiwitobes.com/wiki/Plain_Old_Documentation.html Indexing http://kiwitobes.com/wiki/Mod_perl.html Indexing http://kiwitobes.com/wiki/Palindrome.html Indexing http://kiwitobes.com/wiki/Unicode.html Indexing http://kiwitobes.com/wiki/Virtual_machine.html Indexing http://kiwitobes.com/wiki/Newsgroup.html Indexing http://kiwitobes.com/wiki/Lex_programming_tool.html Indexing http://kiwitobes.com/wiki/Henry_Spencer.html Indexing 
http://kiwitobes.com/wiki/Wikibooks.html Indexing http://kiwitobes.com/wiki/Lisp_programming_language.html Indexing http://kiwitobes.com/wiki/Assembly_language.html Indexing http://kiwitobes.com/wiki/Thread_%28computer_science%29.html Indexing http://kiwitobes.com/wiki/Golf.html Indexing http://kiwitobes.com/wiki/Code_block.html Indexing http://kiwitobes.com/wiki/1987.html Indexing http://kiwitobes.com/wiki/S-expression.html Indexing http://kiwitobes.com/wiki/Cygwin.html Indexing http://kiwitobes.com/wiki/BASIC-PLUS.html Indexing http://kiwitobes.com/wiki/Unisys.html Indexing http://kiwitobes.com/wiki/Man_page.html Indexing http://kiwitobes.com/wiki/1994.html Indexing http://kiwitobes.com/wiki/Dynamic_language.html Indexing http://kiwitobes.com/wiki/Procedural_programming.html Indexing http://kiwitobes.com/wiki/Parable_of_the_Pearl.html Indexing http://kiwitobes.com/wiki/Multi-paradigm_programming_language.html Indexing http://kiwitobes.com/wiki/Newline.html Indexing http://kiwitobes.com/wiki/Huffman_coding.html Indexing http://kiwitobes.com/wiki/Dynamic_programming_language.html Indexing http://kiwitobes.com/wiki/Procedural_programming_language.html Indexing http://kiwitobes.com/wiki/Array.html Indexing http://kiwitobes.com/wiki/Mac_OS.html Indexing http://kiwitobes.com/wiki/AWK_%28programming_language%29.html Indexing http://kiwitobes.com/wiki/Linux.html Indexing http://kiwitobes.com/wiki/Main_Page.html Indexing http://kiwitobes.com/wiki/C_%28programming_language%29.html Indexing http://kiwitobes.com/wiki/Evaluation_strategy Indexing http://kiwitobes.com/wiki/C%2B%2B.html Indexing http://kiwitobes.com/wiki/Mac_OS_X.html Indexing http://kiwitobes.com/wiki/LAMP_%28software_bundle%29.html Indexing http://kiwitobes.com/wiki/System_administrator.html Indexing http://kiwitobes.com/wiki/Functional_programming.html Indexing http://kiwitobes.com/wiki/Type_system.html Indexing http://kiwitobes.com/wiki/Source_code.html Indexing 
http://kiwitobes.com/wiki/Programming_paradigm.html Indexing http://kiwitobes.com/wiki/PEARL_programming_language.html Indexing http://kiwitobes.com/wiki/Java_programming_language.html Indexing http://kiwitobes.com/wiki/Perl_control_structures.html Indexing http://kiwitobes.com/wiki/Cross-platform.html Indexing http://kiwitobes.com/wiki/Unix.html Indexing http://kiwitobes.com/wiki/Website.html Indexing http://kiwitobes.com/wiki/Wiki_software.html Indexing http://kiwitobes.com/wiki/Perl_regular_expression_examples.html Indexing http://kiwitobes.com/wiki/Perl_Mongers.html Indexing http://kiwitobes.com/wiki/VeriSign.html Indexing http://kiwitobes.com/wiki/Shibboleth Indexing http://kiwitobes.com/wiki/Software_release.html Indexing http://kiwitobes.com/wiki/Brace.html Indexing http://kiwitobes.com/wiki/Unix_shell.html Indexing http://kiwitobes.com/wiki/As_of_2006.html Indexing http://kiwitobes.com/wiki/August_20.html Indexing http://kiwitobes.com/wiki/Artistic_License.html Indexing http://kiwitobes.com/wiki/Data_type.html Indexing http://kiwitobes.com/wiki/Site_Finder.html Indexing http://kiwitobes.com/wiki/2006.html Indexing http://kiwitobes.com/wiki/Latin.html Indexing http://kiwitobes.com/wiki/Acronym.html Indexing http://kiwitobes.com/wiki/Call_stack.html Indexing http://kiwitobes.com/wiki/Debian.html Indexing http://kiwitobes.com/wiki/GNU_Compiler_Collection.html Indexing http://kiwitobes.com/wiki/Operating_system.html Indexing http://kiwitobes.com/wiki/Randal_L._Schwartz.html Indexing http://kiwitobes.com/wiki/1995.html Indexing http://kiwitobes.com/wiki/2005.html Indexing http://kiwitobes.com/wiki/Internet_Movie_Database.html Indexing http://kiwitobes.com/wiki/Megabyte.html Indexing http://kiwitobes.com/wiki/Perl_DBI.html Indexing http://kiwitobes.com/wiki/Regular_expressions.html Indexing http://kiwitobes.com/wiki/Constant_folding.html Indexing http://kiwitobes.com/wiki/Assignment_statement.html Indexing http://kiwitobes.com/wiki/Perl_Monks.html Indexing 
http://kiwitobes.com/wiki/Apache_HTTP_server.html Indexing http://kiwitobes.com/wiki/Acme.html Indexing http://kiwitobes.com/wiki/Perl_Object_Environment.html Indexing http://kiwitobes.com/wiki/C_programming_language.html Indexing http://kiwitobes.com/wiki/Obfuscated_Perl_contest.html Indexing http://kiwitobes.com/wiki/Perl.html Indexing http://kiwitobes.com/wiki/Programming_Perl.html Indexing http://kiwitobes.com/wiki/October_26.html Indexing http://kiwitobes.com/wiki/Data_compression.html Indexing http://kiwitobes.com/wiki/Perl_6.html Indexing http://kiwitobes.com/wiki/Learning_Perl.html Indexing http://kiwitobes.com/wiki/Regular_expression.html Indexing http://kiwitobes.com/wiki/Freenode.html Indexing http://kiwitobes.com/wiki/Syntax.html Indexing http://kiwitobes.com/wiki/Fortran.html Indexing http://kiwitobes.com/wiki/Hacker.html Indexing http://kiwitobes.com/wiki/UseModWiki.html Indexing http://kiwitobes.com/wiki/Shell_%28computing%29.html Indexing http://kiwitobes.com/wiki/Larry_Wall.html Indexing http://kiwitobes.com/wiki/Haskell_programming_language.html Indexing http://kiwitobes.com/wiki/Associative_array.html Indexing http://kiwitobes.com/wiki/CPAN.html Indexing http://kiwitobes.com/wiki/Just_another_Perl_hacker.html Indexing http://kiwitobes.com/wiki/January_31.html Indexing http://kiwitobes.com/wiki/There%27s_more_than_one_way_to_do_it.html Indexing http://kiwitobes.com/wiki/PHP.html Indexing http://kiwitobes.com/wiki/Slash_%28weblog_system%29.html Indexing http://kiwitobes.com/wiki/Reference_%28computer_science%29.html Indexing http://kiwitobes.com/wiki/Object_oriented_programming.html Indexing http://kiwitobes.com/wiki/Perl_interpreter.html Indexing http://kiwitobes.com/wiki/Variable.html Indexing http://kiwitobes.com/wiki/Obfuscated_code.html Indexing http://kiwitobes.com/wiki/December_18.html Indexing http://kiwitobes.com/wiki/Perl_Cookbook.html Indexing http://kiwitobes.com/wiki/Benchmark_%28computing%29.html Indexing 
http://kiwitobes.com/wiki/Sed.html Indexing http://kiwitobes.com/wiki/Unix_manual.html Indexing http://kiwitobes.com/wiki/Tail_call.html Indexing http://kiwitobes.com/wiki/Backtracking.html Indexing http://kiwitobes.com/wiki/October_17.html Indexing http://kiwitobes.com/wiki/Shebang_%28Unix%29.html Indexing http://kiwitobes.com/wiki/XS_%28Perl%29.html Indexing http://kiwitobes.com/wiki/Hello_world.html Indexing http://kiwitobes.com/wiki/Parrot_virtual_machine.html Indexing http://kiwitobes.com/wiki/Python_programming_language.html Indexing http://kiwitobes.com/wiki/Comparison_of_programming_languages.html Indexing http://kiwitobes.com/wiki/Comment.html Indexing http://kiwitobes.com/wiki/SQL.html Indexing http://kiwitobes.com/wiki/Pascal_%28programming_language%29.html Indexing http://kiwitobes.com/wiki/PCRE.html
インデックスを作ってみる
ここでは、ページに対する単語によるインデックスを作成する。まずsqliteのインストールから始める。一旦、テーブル作成とインデックス設定を行う。
$ sudo apt-get install python-dev
$ sudo pip install pysqlite
#!/usr/bin/python
"""Crawler skeleton with a sqlite connection; this step only creates the schema."""
from pysqlite2 import dbapi2 as sqlite
import urllib3
from BeautifulSoup import *
from urlparse import urljoin

# Set of very common words; no method in this script consults it yet.
ignorewords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])


class crawler:
    """Crawl pages from a list of start URLs, holding a sqlite connection.

    The indexing methods are still placeholders in this version; the
    connection is used by `createindextables` and `dbcommit`.
    """

    def __init__(self, dbname):
        """Open a sqlite connection to the database `dbname`."""
        self.con = sqlite.connect(dbname)

    def __del__(self):
        """Close the sqlite connection."""
        self.con.close()

    def dbcommit(self):
        """Commit the current transaction on the connection."""
        self.con.commit()

    def getentryid(self, table, field, value, createnew=True):
        """Placeholder; always returns None."""
        return None

    def addtoindex(self, url, soup):
        """Report the page being visited; no index is built yet."""
        print('Indexing %s' % url)

    def gettextonly(self, soup):
        """Placeholder; always returns None."""
        return None

    def separatewords(self, text):
        """Placeholder; always returns None."""
        return None

    def isindexed(self, url):
        """Placeholder; always returns False, so no URL is ever filtered out."""
        return False

    def addlinkref(self, urlFrom, urlTo, linkText):
        """Placeholder; the link is not recorded."""
        pass

    def crawl(self, pages, depth=2):
        """Visit `pages`, then the pages they link to, `depth` levels deep.

        pages -- iterable of start URLs.
        depth -- number of link levels to process (default 2).

        A page that cannot be downloaded is reported and skipped.
        """
        # One pool manager is shared by every request instead of building a
        # new one per page.
        http = urllib3.PoolManager()

        for _ in range(depth):
            newpages = set()
            for page in pages:
                # Download the page's contents.
                try:
                    contents = http.request('GET', page).data
                except Exception:
                    # `except Exception` (not a bare except) lets
                    # KeyboardInterrupt/SystemExit stop the crawl.
                    print("Could not open %s" % page)
                    continue

                # Register the page.
                soup = BeautifulSoup(contents)
                self.addtoindex(page, soup)

                # Follow every anchor that carries an href.
                for link in soup('a'):
                    if 'href' not in dict(link.attrs):
                        continue
                    url = urljoin(page, link['href'])
                    # Skip URLs that contain a single quote.
                    if "'" in url:
                        continue
                    url = url.split('#')[0]  # drop the fragment
                    # Queue the page only if it has not been indexed yet.
                    if url[0:4] == 'http' and not self.isindexed(url):
                        newpages.add(url)
                    linkText = self.gettextonly(link)
                    self.addlinkref(page, url, linkText)

                self.dbcommit()

            # The links collected at this level become the next level's pages.
            pages = newpages

    def createindextables(self):
        """Create the tables and indexes of the search index, then commit.

        Every statement uses `if not exists`, so calling this on a database
        that already holds the schema is a no-op instead of an error.
        """
        statements = (
            'create table if not exists urllist(url)',
            'create table if not exists wordlist(word)',
            'create table if not exists wordlocation(urlid,wordid,location)',
            'create table if not exists link(fromid integer, toid integer)',
            'create table if not exists linkwords(wordid,linkid)',
            'create index if not exists wordidx on wordlist(word)',
            'create index if not exists urlidx on urllist(url)',
            'create index if not exists urltoidx on link(toid)',
            'create index if not exists urlfromidx on link(fromid)',
        )
        for statement in statements:
            self.con.execute(statement)
        self.dbcommit()


if __name__ == '__main__':
    pagelist = ['http://kiwitobes.com/wiki/Perl.html']
    # Use a distinct name so the class `crawler` is not rebound to an instance.
    spider = crawler('searchindex.db')
    spider.createindextables()
    # Crawling stays disabled at this step; only the schema is created.
    #spider.crawl(pagelist)
ページ内の単語を抽出
以下の手順で行う。
- HTMLからタグを排除して、文字列だけ取り出す。
- [a-zA-Z0-9_]+ にマッチする文字列を一つの単語として、分割してリストを生成する。
- (この記事の範囲外)各単語に対してステミングを行い、語幹のみのリストを生成する。
#!/usr/bin/python
"""Crawler that stores a word index and link graph of visited pages in sqlite."""
import re

from pysqlite2 import dbapi2 as sqlite
import urllib3
from BeautifulSoup import *
from urlparse import urljoin

# Words skipped when indexing page text and link text.
ignorewords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])

# Runs of non-word characters separate words.  `+` (not `*`) is used so the
# pattern can never match the empty string.
_WORD_SPLITTER = re.compile(r'\W+')


class crawler:
    """Crawl pages from start URLs and record words and links in sqlite."""

    def __init__(self, dbname):
        """Open a sqlite connection to the database `dbname`."""
        self.con = sqlite.connect(dbname)

    def __del__(self):
        """Close the sqlite connection."""
        self.con.close()

    def dbcommit(self):
        """Commit the current transaction on the connection."""
        self.con.commit()

    def getentryid(self, table, field, value, createnew=True):
        """Return the rowid of the row in `table` whose `field` equals `value`.

        table, field -- table and column names; they are interpolated into
                        the SQL text, so they must be trusted identifiers.
        value        -- value to look up; bound as a query parameter.
        createnew    -- when True (default) a missing row is inserted and its
                        new rowid returned; when False, None is returned for
                        a missing row.
        """
        cur = self.con.execute(
            "select rowid from %s where %s=?" % (table, field), (value,))
        res = cur.fetchone()
        if res is not None:
            return res[0]
        if not createnew:
            return None
        cur = self.con.execute(
            "insert into %s (%s) values (?)" % (table, field), (value,))
        return cur.lastrowid

    def addtoindex(self, url, soup):
        """Store the position of every non-ignored word of a page.

        Does nothing when `isindexed(url)` is already true.
        """
        if self.isindexed(url):
            return
        print('Indexing %s' % url)

        # Extract the words of the page.
        text = self.gettextonly(soup)
        words = self.separatewords(text)

        # Get (or create) the id of the URL.
        urlid = self.getentryid('urllist', 'url', url)

        # Link each word to this URL together with its position in the text.
        for location, word in enumerate(words):
            if word in ignorewords:
                continue
            wordid = self.getentryid('wordlist', 'word', word)
            self.con.execute(
                "insert into wordlocation(urlid,wordid,location) "
                "values (?,?,?)",
                (urlid, wordid, location))

    def gettextonly(self, soup):
        """Return the text of a parsed node with the markup removed.

        When `soup.string` is None the node's `contents` are processed
        recursively and each piece is followed by a newline; otherwise the
        stripped string is returned.
        """
        v = soup.string
        if v is None:
            return ''.join(self.gettextonly(t) + '\n' for t in soup.contents)
        return v.strip()

    def separatewords(self, text):
        """Split `text` on non-word characters; return lower-cased words."""
        return [s.lower() for s in _WORD_SPLITTER.split(text) if s != '']

    def isindexed(self, url):
        """Return True if `url` is in urllist and has wordlocation rows."""
        u = self.con.execute(
            "select rowid from urllist where url=?", (url,)).fetchone()
        if u is None:
            return False
        # A URL row without word locations does not count as indexed.
        v = self.con.execute(
            'select * from wordlocation where urlid=?', (u[0],)).fetchone()
        return v is not None

    def addlinkref(self, urlFrom, urlTo, linkText):
        """Record a link between two pages and the words of its link text.

        Nothing is stored when both URLs resolve to the same urllist row.
        """
        words = self.separatewords(linkText)
        fromid = self.getentryid('urllist', 'url', urlFrom)
        toid = self.getentryid('urllist', 'url', urlTo)
        if fromid == toid:
            return
        cur = self.con.execute(
            "insert into link(fromid, toid) values (?,?)", (fromid, toid))
        linkid = cur.lastrowid
        for word in words:
            if word in ignorewords:
                continue
            wordid = self.getentryid('wordlist', 'word', word)
            self.con.execute(
                "insert into linkwords(linkid,wordid) values (?,?)",
                (linkid, wordid))

    def crawl(self, pages, depth=2):
        """Index `pages`, then the pages they link to, `depth` levels deep.

        pages -- iterable of start URLs.
        depth -- number of link levels to process (default 2).

        A page that cannot be downloaded is reported and skipped.  Changes
        are committed after each page.
        """
        # One pool manager is shared by every request instead of building a
        # new one per page.
        http = urllib3.PoolManager()

        for _ in range(depth):
            newpages = set()
            for page in pages:
                # Download the page's contents.
                try:
                    contents = http.request('GET', page).data
                except Exception:
                    # `except Exception` (not a bare except) lets
                    # KeyboardInterrupt/SystemExit stop the crawl.
                    print("Could not open %s" % page)
                    continue

                # Index the page.
                soup = BeautifulSoup(contents)
                self.addtoindex(page, soup)

                # Follow every anchor that carries an href.
                for link in soup('a'):
                    if 'href' not in dict(link.attrs):
                        continue
                    url = urljoin(page, link['href'])
                    # Skip URLs that contain a single quote.
                    if "'" in url:
                        continue
                    url = url.split('#')[0]  # drop the fragment
                    # Queue the page only if it has not been indexed yet.
                    if url[0:4] == 'http' and not self.isindexed(url):
                        newpages.add(url)
                    linkText = self.gettextonly(link)
                    self.addlinkref(page, url, linkText)

                self.dbcommit()

            # The links collected at this level become the next level's pages.
            pages = newpages

    def createindextables(self):
        """Create the tables and indexes of the search index, then commit.

        Every statement uses `if not exists`, so calling this on a database
        that already holds the schema is a no-op instead of an error.
        """
        statements = (
            'create table if not exists urllist(url)',
            'create table if not exists wordlist(word)',
            'create table if not exists wordlocation(urlid,wordid,location)',
            'create table if not exists link(fromid integer, toid integer)',
            'create table if not exists linkwords(wordid,linkid)',
            'create index if not exists wordidx on wordlist(word)',
            'create index if not exists urlidx on urllist(url)',
            'create index if not exists urltoidx on link(toid)',
            'create index if not exists urlfromidx on link(fromid)',
        )
        for statement in statements:
            self.con.execute(statement)
        self.dbcommit()


if __name__ == '__main__':
    pagelist = ['http://kiwitobes.com/wiki/Perl.html']
    # Use a distinct name so the class `crawler` is not rebound to an instance.
    spider = crawler('searchindex.db')
    spider.crawl(pagelist)
    print([row for row in spider.con.execute(
        'select rowid from wordlocation where wordid=1')])
リファレンス
この記事は以下の本を元に知人に解説するために説明の仕方を変えて記述したものである。なお、1章と2章は公開されている。 集合知プログラミング1,2章
【楽天ブックスならいつでも送料無料】集合知プログラミング [ トビ-・セガラン ] 価格:3,672円(税込、送料込) |