$ sudo apt-get install python-pip
$ sudo pip install urllib3
$ sudo pip install beautifulsoup
#!/usr/bin/python
# Running this script prints the "Indexing ..." lines listed after it.
import urllib3
from BeautifulSoup import *
from urlparse import urljoin

# Words too common to be worth indexing.
ignorewords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])


class crawler:
    """Skeleton of a breadth-first web crawler.

    Only crawl() and the progress message in addtoindex() do real work
    here; every other method is a placeholder with a fixed return value.
    """

    def __init__(self, dbname):
        """Accept a database name; nothing is opened in this version."""
        pass

    def __del__(self):
        pass

    def dbcommit(self):
        """Placeholder: there is no database to commit to yet."""
        pass

    def getentryid(self, table, field, value, createnew=True):
        """Placeholder: always returns None."""
        return None

    def addtoindex(self, url, soup):
        """Report the page being indexed (no index is written yet)."""
        print('Indexing %s' % url)

    def gettextonly(self, soup):
        """Placeholder: always returns None."""
        return None

    def separatewords(self, text):
        """Placeholder: always returns None."""
        return None

    def isindexed(self, url):
        """Placeholder: always reports the URL as not indexed."""
        return False

    def addlinkref(self, urlFrom, urlTo, linkText):
        """Placeholder: link relationships are not recorded yet."""
        pass

    def createindextables(self):
        """Placeholder: no tables are created yet."""
        pass

    def crawl(self, pages, depth=2):
        """Index `pages`, then follow their links for `depth` rounds.

        pages -- iterable of start URLs.
        depth -- number of rounds; each round processes the links that
                 were collected during the previous one.

        Pages that cannot be downloaded are reported and skipped.
        """
        # One pool for the whole crawl so connections to a host are reused
        # instead of being re-established for every page.
        http = urllib3.PoolManager()
        for _ in range(depth):
            newpages = set()
            for page in pages:
                try:
                    contents = http.request('GET', page).data
                except Exception:
                    # Best effort: skip pages that fail to download.
                    # Unlike a bare "except:", this still lets Ctrl-C
                    # and SystemExit stop the crawl.
                    print("Could not open %s" % page)
                    continue

                soup = BeautifulSoup(contents)
                self.addtoindex(page, soup)

                for link in soup('a'):
                    if 'href' not in dict(link.attrs):
                        continue
                    url = urljoin(page, link['href'])
                    if "'" in url:
                        # URLs containing a quote character are skipped.
                        continue
                    url = url.split('#')[0]  # drop the fragment
                    if url.startswith('http') and not self.isindexed(url):
                        newpages.add(url)
                    linkText = self.gettextonly(link)
                    self.addlinkref(page, url, linkText)

                # Commit once per page.
                self.dbcommit()

            pages = newpages


if __name__ == '__main__':
    pagelist = ['http://kiwitobes.com/wiki/Perl.html']
    # Named differently from the class so the class stays reachable.
    spider = crawler('')
    spider.crawl(pagelist)
Indexing http://kiwitobes.com/wiki/Perl.html Indexing http://kiwitobes.com/wiki/Module_%28programming%29.html Indexing http://kiwitobes.com/wiki/Open_Directory_Project.html Indexing http://kiwitobes.com/wiki/Common_Gateway_Interface.html Indexing http://kiwitobes.com/wiki/List_%28computing%29.html Indexing http://kiwitobes.com/wiki/C_Sharp.html Indexing http://kiwitobes.com/wiki/Free_software.html Indexing http://kiwitobes.com/wiki/GNU_bison.html Indexing http://kiwitobes.com/wiki/Yacc.html Indexing http://kiwitobes.com/wiki/Solution_stack.html Indexing http://kiwitobes.com/wiki/If_and_only_if.html Indexing http://kiwitobes.com/wiki/Switch_statement.html Indexing http://kiwitobes.com/wiki/Bourne_shell.html Indexing http://kiwitobes.com/wiki/Glue_language.html Indexing http://kiwitobes.com/wiki/Quality_control.html Indexing http://kiwitobes.com/wiki/2001.html Indexing http://kiwitobes.com/wiki/ANSI.html Indexing http://kiwitobes.com/wiki/First-class_function.html Indexing http://kiwitobes.com/wiki/Perl_%28disambiguation%29.html Indexing http://kiwitobes.com/wiki/Hash_table.html Indexing http://kiwitobes.com/wiki/V6_%28Perl%29.html Indexing http://kiwitobes.com/wiki/The_Perl_Foundation.html Indexing http://kiwitobes.com/wiki/AmigaOS.html Indexing http://kiwitobes.com/wiki/Perl_Data_Language.html Indexing http://kiwitobes.com/wiki/Subroutine.html Indexing http://kiwitobes.com/wiki/As_of_2005.html Indexing http://kiwitobes.com/wiki/GNU_General_Public_License.html Indexing http://kiwitobes.com/wiki/Pragma.html Indexing http://kiwitobes.com/wiki/POSIX.html Indexing http://kiwitobes.com/wiki/O%27Reilly_Media.html Indexing http://kiwitobes.com/wiki/Eval.html Indexing http://kiwitobes.com/wiki/Scalar_%28computing%29.html Indexing http://kiwitobes.com/wiki/Filename_extension.html Indexing http://kiwitobes.com/wiki/Token_%28parser%29.html Indexing http://kiwitobes.com/wiki/Control_structure.html Indexing http://kiwitobes.com/wiki/Dynamic_typing.html Indexing 
http://kiwitobes.com/wiki/Closure_%28computer_science%29.html Indexing http://kiwitobes.com/wiki/Perl Indexing http://kiwitobes.com/wiki/Print.html Indexing http://kiwitobes.com/wiki/Microsoft_Windows.html Indexing http://kiwitobes.com/wiki/FreeBSD.html Indexing http://kiwitobes.com/wiki/Expression_%28programming%29.html Indexing http://kiwitobes.com/wiki/AWK_programming_language.html Indexing http://kiwitobes.com/wiki/Peephole_optimization.html Indexing http://kiwitobes.com/wiki/String_%28computer_science%29.html Indexing http://kiwitobes.com/wiki/Software_license.html Indexing http://kiwitobes.com/wiki/SUSE.html Indexing http://kiwitobes.com/wiki/Tar_%28file_format%29.html Indexing http://kiwitobes.com/wiki/Sigil_%28computer_programming%29.html Indexing http://kiwitobes.com/wiki/Pugs.html Indexing http://kiwitobes.com/wiki/BeOS.html Indexing http://kiwitobes.com/wiki/Comprehensive_Perl_Archive_Network.html Indexing http://kiwitobes.com/wiki/PerlScript.html Indexing http://kiwitobes.com/wiki/Gentoo_Linux.html Indexing http://kiwitobes.com/wiki/Data_structure.html Indexing http://kiwitobes.com/wiki/Object-oriented_programming.html Indexing http://kiwitobes.com/wiki/Grammar.html Indexing http://kiwitobes.com/wiki/Unix-like.html Indexing http://kiwitobes.com/wiki/Audrey_Tang.html Indexing http://kiwitobes.com/wiki/Bioinformatics.html Indexing http://kiwitobes.com/wiki/Ruby_programming_language.html Indexing http://kiwitobes.com/wiki/Backronym.html Indexing http://kiwitobes.com/wiki/Memory_management.html Indexing http://kiwitobes.com/wiki/Plain_Old_Documentation.html Indexing http://kiwitobes.com/wiki/Mod_perl.html Indexing http://kiwitobes.com/wiki/Palindrome.html Indexing http://kiwitobes.com/wiki/Unicode.html Indexing http://kiwitobes.com/wiki/Virtual_machine.html Indexing http://kiwitobes.com/wiki/Newsgroup.html Indexing http://kiwitobes.com/wiki/Lex_programming_tool.html Indexing http://kiwitobes.com/wiki/Henry_Spencer.html Indexing 
http://kiwitobes.com/wiki/Wikibooks.html Indexing http://kiwitobes.com/wiki/Lisp_programming_language.html Indexing http://kiwitobes.com/wiki/Assembly_language.html Indexing http://kiwitobes.com/wiki/Thread_%28computer_science%29.html Indexing http://kiwitobes.com/wiki/Golf.html Indexing http://kiwitobes.com/wiki/Code_block.html Indexing http://kiwitobes.com/wiki/1987.html Indexing http://kiwitobes.com/wiki/S-expression.html Indexing http://kiwitobes.com/wiki/Cygwin.html Indexing http://kiwitobes.com/wiki/BASIC-PLUS.html Indexing http://kiwitobes.com/wiki/Unisys.html Indexing http://kiwitobes.com/wiki/Man_page.html Indexing http://kiwitobes.com/wiki/1994.html Indexing http://kiwitobes.com/wiki/Dynamic_language.html Indexing http://kiwitobes.com/wiki/Procedural_programming.html Indexing http://kiwitobes.com/wiki/Parable_of_the_Pearl.html Indexing http://kiwitobes.com/wiki/Multi-paradigm_programming_language.html Indexing http://kiwitobes.com/wiki/Newline.html Indexing http://kiwitobes.com/wiki/Huffman_coding.html Indexing http://kiwitobes.com/wiki/Dynamic_programming_language.html Indexing http://kiwitobes.com/wiki/Procedural_programming_language.html Indexing http://kiwitobes.com/wiki/Array.html Indexing http://kiwitobes.com/wiki/Mac_OS.html Indexing http://kiwitobes.com/wiki/AWK_%28programming_language%29.html Indexing http://kiwitobes.com/wiki/Linux.html Indexing http://kiwitobes.com/wiki/Main_Page.html Indexing http://kiwitobes.com/wiki/C_%28programming_language%29.html Indexing http://kiwitobes.com/wiki/Evaluation_strategy Indexing http://kiwitobes.com/wiki/C%2B%2B.html Indexing http://kiwitobes.com/wiki/Mac_OS_X.html Indexing http://kiwitobes.com/wiki/LAMP_%28software_bundle%29.html Indexing http://kiwitobes.com/wiki/System_administrator.html Indexing http://kiwitobes.com/wiki/Functional_programming.html Indexing http://kiwitobes.com/wiki/Type_system.html Indexing http://kiwitobes.com/wiki/Source_code.html Indexing 
http://kiwitobes.com/wiki/Programming_paradigm.html Indexing http://kiwitobes.com/wiki/PEARL_programming_language.html Indexing http://kiwitobes.com/wiki/Java_programming_language.html Indexing http://kiwitobes.com/wiki/Perl_control_structures.html Indexing http://kiwitobes.com/wiki/Cross-platform.html Indexing http://kiwitobes.com/wiki/Unix.html Indexing http://kiwitobes.com/wiki/Website.html Indexing http://kiwitobes.com/wiki/Wiki_software.html Indexing http://kiwitobes.com/wiki/Perl_regular_expression_examples.html Indexing http://kiwitobes.com/wiki/Perl_Mongers.html Indexing http://kiwitobes.com/wiki/VeriSign.html Indexing http://kiwitobes.com/wiki/Shibboleth Indexing http://kiwitobes.com/wiki/Software_release.html Indexing http://kiwitobes.com/wiki/Brace.html Indexing http://kiwitobes.com/wiki/Unix_shell.html Indexing http://kiwitobes.com/wiki/As_of_2006.html Indexing http://kiwitobes.com/wiki/August_20.html Indexing http://kiwitobes.com/wiki/Artistic_License.html Indexing http://kiwitobes.com/wiki/Data_type.html Indexing http://kiwitobes.com/wiki/Site_Finder.html Indexing http://kiwitobes.com/wiki/2006.html Indexing http://kiwitobes.com/wiki/Latin.html Indexing http://kiwitobes.com/wiki/Acronym.html Indexing http://kiwitobes.com/wiki/Call_stack.html Indexing http://kiwitobes.com/wiki/Debian.html Indexing http://kiwitobes.com/wiki/GNU_Compiler_Collection.html Indexing http://kiwitobes.com/wiki/Operating_system.html Indexing http://kiwitobes.com/wiki/Randal_L._Schwartz.html Indexing http://kiwitobes.com/wiki/1995.html Indexing http://kiwitobes.com/wiki/2005.html Indexing http://kiwitobes.com/wiki/Internet_Movie_Database.html Indexing http://kiwitobes.com/wiki/Megabyte.html Indexing http://kiwitobes.com/wiki/Perl_DBI.html Indexing http://kiwitobes.com/wiki/Regular_expressions.html Indexing http://kiwitobes.com/wiki/Constant_folding.html Indexing http://kiwitobes.com/wiki/Assignment_statement.html Indexing http://kiwitobes.com/wiki/Perl_Monks.html Indexing 
http://kiwitobes.com/wiki/Apache_HTTP_server.html Indexing http://kiwitobes.com/wiki/Acme.html Indexing http://kiwitobes.com/wiki/Perl_Object_Environment.html Indexing http://kiwitobes.com/wiki/C_programming_language.html Indexing http://kiwitobes.com/wiki/Obfuscated_Perl_contest.html Indexing http://kiwitobes.com/wiki/Perl.html Indexing http://kiwitobes.com/wiki/Programming_Perl.html Indexing http://kiwitobes.com/wiki/October_26.html Indexing http://kiwitobes.com/wiki/Data_compression.html Indexing http://kiwitobes.com/wiki/Perl_6.html Indexing http://kiwitobes.com/wiki/Learning_Perl.html Indexing http://kiwitobes.com/wiki/Regular_expression.html Indexing http://kiwitobes.com/wiki/Freenode.html Indexing http://kiwitobes.com/wiki/Syntax.html Indexing http://kiwitobes.com/wiki/Fortran.html Indexing http://kiwitobes.com/wiki/Hacker.html Indexing http://kiwitobes.com/wiki/UseModWiki.html Indexing http://kiwitobes.com/wiki/Shell_%28computing%29.html Indexing http://kiwitobes.com/wiki/Larry_Wall.html Indexing http://kiwitobes.com/wiki/Haskell_programming_language.html Indexing http://kiwitobes.com/wiki/Associative_array.html Indexing http://kiwitobes.com/wiki/CPAN.html Indexing http://kiwitobes.com/wiki/Just_another_Perl_hacker.html Indexing http://kiwitobes.com/wiki/January_31.html Indexing http://kiwitobes.com/wiki/There%27s_more_than_one_way_to_do_it.html Indexing http://kiwitobes.com/wiki/PHP.html Indexing http://kiwitobes.com/wiki/Slash_%28weblog_system%29.html Indexing http://kiwitobes.com/wiki/Reference_%28computer_science%29.html Indexing http://kiwitobes.com/wiki/Object_oriented_programming.html Indexing http://kiwitobes.com/wiki/Perl_interpreter.html Indexing http://kiwitobes.com/wiki/Variable.html Indexing http://kiwitobes.com/wiki/Obfuscated_code.html Indexing http://kiwitobes.com/wiki/December_18.html Indexing http://kiwitobes.com/wiki/Perl_Cookbook.html Indexing http://kiwitobes.com/wiki/Benchmark_%28computing%29.html Indexing 
http://kiwitobes.com/wiki/Sed.html Indexing http://kiwitobes.com/wiki/Unix_manual.html Indexing http://kiwitobes.com/wiki/Tail_call.html Indexing http://kiwitobes.com/wiki/Backtracking.html Indexing http://kiwitobes.com/wiki/October_17.html Indexing http://kiwitobes.com/wiki/Shebang_%28Unix%29.html Indexing http://kiwitobes.com/wiki/XS_%28Perl%29.html Indexing http://kiwitobes.com/wiki/Hello_world.html Indexing http://kiwitobes.com/wiki/Parrot_virtual_machine.html Indexing http://kiwitobes.com/wiki/Python_programming_language.html Indexing http://kiwitobes.com/wiki/Comparison_of_programming_languages.html Indexing http://kiwitobes.com/wiki/Comment.html Indexing http://kiwitobes.com/wiki/SQL.html Indexing http://kiwitobes.com/wiki/Pascal_%28programming_language%29.html Indexing http://kiwitobes.com/wiki/PCRE.html
一旦、テーブル作成とインデックス設定を行う。
$ sudo apt-get install python-dev
$ sudo pip install pysqlite
#!/usr/bin/python
from pysqlite2 import dbapi2 as sqlite
import urllib3
from BeautifulSoup import *
from urlparse import urljoin

# Words too common to be worth indexing.
ignorewords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])


class crawler:
    """Web crawler backed by an SQLite database.

    This version opens the database and can create the index schema;
    the indexing helpers are still placeholders with fixed return values.
    """

    def __init__(self, dbname):
        """Open (or create) the SQLite database file `dbname`."""
        self.con = sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        """Commit the pending transaction."""
        self.con.commit()

    def getentryid(self, table, field, value, createnew=True):
        """Placeholder: always returns None."""
        return None

    def addtoindex(self, url, soup):
        """Report the page being indexed (no index is written yet)."""
        print('Indexing %s' % url)

    def gettextonly(self, soup):
        """Placeholder: always returns None."""
        return None

    def separatewords(self, text):
        """Placeholder: always returns None."""
        return None

    def isindexed(self, url):
        """Placeholder: always reports the URL as not indexed."""
        return False

    def addlinkref(self, urlFrom, urlTo, linkText):
        """Placeholder: link relationships are not recorded yet."""
        pass

    def crawl(self, pages, depth=2):
        """Index `pages`, then follow their links for `depth` rounds.

        pages -- iterable of start URLs.
        depth -- number of rounds; each round processes the links that
                 were collected during the previous one.

        Pages that cannot be downloaded are reported and skipped.
        """
        # One pool for the whole crawl so connections to a host are reused
        # instead of being re-established for every page.
        http = urllib3.PoolManager()
        for _ in range(depth):
            newpages = set()
            for page in pages:
                try:
                    contents = http.request('GET', page).data
                except Exception:
                    # Best effort: skip pages that fail to download.
                    # Unlike a bare "except:", this still lets Ctrl-C
                    # and SystemExit stop the crawl.
                    print("Could not open %s" % page)
                    continue

                soup = BeautifulSoup(contents)
                self.addtoindex(page, soup)

                for link in soup('a'):
                    if 'href' not in dict(link.attrs):
                        continue
                    url = urljoin(page, link['href'])
                    if "'" in url:
                        # URLs containing a quote character are skipped.
                        continue
                    url = url.split('#')[0]  # drop the fragment
                    # Queue the page unless it has been indexed already.
                    if url.startswith('http') and not self.isindexed(url):
                        newpages.add(url)
                    linkText = self.gettextonly(link)
                    self.addlinkref(page, url, linkText)

                # Commit once per page.
                self.dbcommit()

            pages = newpages

    def createindextables(self):
        """Create the index tables and their lookup indexes.

        Every statement uses IF NOT EXISTS, so calling this again on a
        database that already has the schema is a no-op instead of an
        error.
        """
        statements = (
            'create table if not exists urllist(url)',
            'create table if not exists wordlist(word)',
            'create table if not exists '
            'wordlocation(urlid,wordid,location)',
            'create table if not exists '
            'link(fromid integer, toid integer)',
            'create table if not exists linkwords(wordid,linkid)',
            'create index if not exists wordidx on wordlist(word)',
            'create index if not exists urlidx on urllist(url)',
            'create index if not exists urltoidx on link(toid)',
            'create index if not exists urlfromidx on link(fromid)',
        )
        for statement in statements:
            self.con.execute(statement)
        self.dbcommit()


if __name__ == '__main__':
    pagelist = ['http://kiwitobes.com/wiki/Perl.html']
    # Named differently from the class so the class stays reachable.
    spider = crawler('searchindex.db')
    spider.createindextables()
    # spider.crawl(pagelist)  # left disabled: this run only builds the schema
#!/usr/bin/python
import re

from pysqlite2 import dbapi2 as sqlite
import urllib3
from BeautifulSoup import *
from urlparse import urljoin

# Words too common to be worth indexing.
ignorewords = set(['the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'])

# Runs of non-word characters separate words.  '+' (not '*') is required:
# a pattern that can match the empty string makes re.split() cut between
# every character on Python 3.7 and later.
_WORD_SPLITTER = re.compile(r'\W+')


class crawler:
    """Web crawler that builds a word and link index in SQLite."""

    def __init__(self, dbname):
        """Open (or create) the SQLite database file `dbname`."""
        self.con = sqlite.connect(dbname)

    def __del__(self):
        self.con.close()

    def dbcommit(self):
        """Commit the pending transaction."""
        self.con.commit()

    def getentryid(self, table, field, value, createnew=True):
        """Return the rowid of the row in `table` whose `field` is `value`.

        If no such row exists it is inserted and the new rowid returned,
        unless `createnew` is false, in which case None is returned.

        `table` and `field` are interpolated into the SQL text because
        identifiers cannot be bound as parameters; they must only ever
        come from trusted code.  `value` is bound as a parameter.
        """
        cur = self.con.execute(
            "select rowid from %s where %s=?" % (table, field), (value,))
        row = cur.fetchone()
        if row is not None:
            return row[0]
        if not createnew:
            return None
        cur = self.con.execute(
            "insert into %s (%s) values (?)" % (table, field), (value,))
        return cur.lastrowid

    def addtoindex(self, url, soup):
        """Record every non-ignored word of the page with its position.

        Does nothing when the URL has already been indexed.
        """
        if self.isindexed(url):
            return
        print('Indexing %s' % url)

        text = self.gettextonly(soup)
        words = self.separatewords(text)
        urlid = self.getentryid('urllist', 'url', url)

        # The location counts ignored words too, so positions stay
        # relative to the full word sequence of the page.
        for location, word in enumerate(words):
            if word in ignorewords:
                continue
            wordid = self.getentryid('wordlist', 'word', word)
            self.con.execute(
                "insert into wordlocation(urlid,wordid,location) "
                "values (?,?,?)", (urlid, wordid, location))

    def gettextonly(self, soup):
        """Return the text of a parsed node with the markup removed.

        A node whose `.string` is set yields that string stripped of
        surrounding whitespace; otherwise the texts of its children are
        concatenated, each followed by a newline.
        """
        text = soup.string
        if text is not None:
            return text.strip()
        return ''.join(
            self.gettextonly(child) + '\n' for child in soup.contents)

    def separatewords(self, text):
        """Split `text` on non-word characters into lower-case words."""
        return [
            word.lower() for word in _WORD_SPLITTER.split(text) if word != ''
        ]

    def isindexed(self, url):
        """Return True if `url` is known and has at least one word stored."""
        row = self.con.execute(
            "select rowid from urllist where url=?", (url,)).fetchone()
        if row is None:
            return False
        # A URL can be listed only because another page links to it, so
        # it counts as indexed only once word locations exist for it.
        hit = self.con.execute(
            "select * from wordlocation where urlid=?", (row[0],)).fetchone()
        return hit is not None

    def addlinkref(self, urlFrom, urlTo, linkText):
        """Store a link between two pages along with its anchor words.

        Links from a page to itself are not recorded.
        """
        words = self.separatewords(linkText)
        fromid = self.getentryid('urllist', 'url', urlFrom)
        toid = self.getentryid('urllist', 'url', urlTo)
        if fromid == toid:
            return
        cur = self.con.execute(
            "insert into link(fromid, toid) values (?,?)", (fromid, toid))
        linkid = cur.lastrowid
        for word in words:
            if word in ignorewords:
                continue
            wordid = self.getentryid('wordlist', 'word', word)
            self.con.execute(
                "insert into linkwords(linkid,wordid) values (?,?)",
                (linkid, wordid))

    def crawl(self, pages, depth=2):
        """Index `pages`, then follow their links for `depth` rounds.

        pages -- iterable of start URLs.
        depth -- number of rounds; each round processes the links that
                 were collected during the previous one.

        Pages that cannot be downloaded are reported and skipped.
        """
        # One pool for the whole crawl so connections to a host are reused
        # instead of being re-established for every page.
        http = urllib3.PoolManager()
        for _ in range(depth):
            newpages = set()
            for page in pages:
                try:
                    contents = http.request('GET', page).data
                except Exception:
                    # Best effort: skip pages that fail to download.
                    # Unlike a bare "except:", this still lets Ctrl-C
                    # and SystemExit stop the crawl.
                    print("Could not open %s" % page)
                    continue

                soup = BeautifulSoup(contents)
                self.addtoindex(page, soup)

                for link in soup('a'):
                    if 'href' not in dict(link.attrs):
                        continue
                    url = urljoin(page, link['href'])
                    if "'" in url:
                        # Kept so the same set of pages is crawled as
                        # before; the queries themselves bind their
                        # values and do not depend on this filter.
                        continue
                    url = url.split('#')[0]  # drop the fragment
                    # Queue the page unless it has been indexed already.
                    if url.startswith('http') and not self.isindexed(url):
                        newpages.add(url)
                    linkText = self.gettextonly(link)
                    self.addlinkref(page, url, linkText)

                # Commit once per page.
                self.dbcommit()

            pages = newpages

    def createindextables(self):
        """Create the index tables and their lookup indexes.

        Every statement uses IF NOT EXISTS, so calling this again on a
        database that already has the schema is a no-op instead of an
        error.
        """
        statements = (
            'create table if not exists urllist(url)',
            'create table if not exists wordlist(word)',
            'create table if not exists '
            'wordlocation(urlid,wordid,location)',
            'create table if not exists '
            'link(fromid integer, toid integer)',
            'create table if not exists linkwords(wordid,linkid)',
            'create index if not exists wordidx on wordlist(word)',
            'create index if not exists urlidx on urllist(url)',
            'create index if not exists urltoidx on link(toid)',
            'create index if not exists urlfromidx on link(fromid)',
        )
        for statement in statements:
            self.con.execute(statement)
        self.dbcommit()


if __name__ == '__main__':
    pagelist = ['http://kiwitobes.com/wiki/Perl.html']
    # Named differently from the class so the class stays reachable.
    spider = crawler('searchindex.db')
    # No-op when the schema already exists; lets the script also run
    # against a fresh database file.
    spider.createindextables()
    spider.crawl(pagelist)
    print([row for row in spider.con.execute(
        'select rowid from wordlocation where wordid=1')])
【楽天ブックスならいつでも送料無料】集合知プログラミング [ トビ-・セガラン ] 価格:3,672円(税込、送料込) |