環境準備(Ubuntu)
ここでは、Webページをダウンロードするためにurllib3というライブラリを使用し、Webページの解釈のためにBeautifulSoupというライブラリを使用する。
以下のコマンドを叩くことでそれらのライブラリをインストールすることが出来る。
$ sudo apt-get install python-pip
$ sudo pip install urllib3
$ sudo pip install beautifulsoup
クローラーを作る
初期ページのURLを与えたら、自動的にリンクを探して各ページを巡回するクローラーを作ってみる。
まずは巡回のみ
まずは、リンクを巡回する部分だ。
#!/usr/bin/python
import urllib3
from BeautifulSoup import *
from urlparse import urljoin
# Set of very common English words to ignore.
ignorewords = {'the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'}
class crawler:
    """Skeleton crawler that follows links breadth-first from seed pages.

    Every storage-related method is still a stub here: nothing is saved,
    ``isindexed`` always answers ``False`` and ``addtoindex`` only prints
    a progress line. ``crawl`` is the only method doing real work.
    """

    def __init__(self, dbname):
        """Accept a database name; this skeleton does not use it."""
        pass

    def __del__(self):
        pass

    def dbcommit(self):
        """Stub: there is nothing to commit yet."""
        pass

    def getentryid(self, table, field, value, createnew=True):
        """Stub: always returns ``None``."""
        return None

    def addtoindex(self, url, soup):
        """Print a progress line for ``url``; ``soup`` is not used yet."""
        # A parenthesised single-argument print behaves the same on
        # Python 2 and Python 3.
        print('Indexing %s' % url)

    def gettextonly(self, soup):
        """Stub: always returns ``None``."""
        return None

    def separatewords(self, text):
        """Stub: always returns ``None``."""
        return None

    def isindexed(self, url):
        """Stub: always returns ``False``, so every URL counts as new."""
        return False

    def addlinkref(self, urlFrom, urlTo, linkText):
        """Stub: the link between the two pages is not recorded yet."""
        pass

    def createindextables(self):
        """Stub: no tables are created yet."""
        pass

    def crawl(self, pages, depth=2):
        """Crawl breadth-first, starting from ``pages``.

        Each round downloads every page of the current set, passes it to
        ``addtoindex`` and collects the ``http`` links found in its
        ``<a href>`` tags; those links form the page set of the next round.

        Args:
            pages: iterable of URLs to start from.
            depth: number of rounds to run.
        """
        # One pool for the whole crawl, so connections can be reused
        # instead of building a new manager for every page.
        http = urllib3.PoolManager()
        for _ in range(depth):
            newpages = set()
            for page in pages:
                # Download the page's contents.
                try:
                    contents = http.request('GET', page).data
                except Exception:
                    # Best effort: report the page and carry on.  Unlike a
                    # bare ``except:``, Ctrl-C and SystemExit still get out.
                    print("Could not open %s" % page)
                    continue
                # Register the page in the index.
                soup = BeautifulSoup(contents)
                self.addtoindex(page, soup)
                # Walk the page's links.
                for link in soup('a'):
                    if 'href' not in dict(link.attrs):
                        continue
                    url = urljoin(page, link['href'])
                    if "'" in url:
                        continue
                    url = url.split('#')[0]  # drop the anchor part
                    if url.startswith('http') and not self.isindexed(url):
                        newpages.add(url)
                    linkText = self.gettextonly(link)
                    self.addlinkref(page, url, linkText)
                self.dbcommit()
            pages = newpages
if __name__ == '__main__':
    # Seed the crawl with a single start page.
    pagelist = ['http://kiwitobes.com/wiki/Perl.html']
    # Named ``spider`` so that the instance does not rebind (and hide)
    # the ``crawler`` class itself.
    spider = crawler('')
    spider.crawl(pagelist)
これを実行すると以下の出力が得られる。
Indexing http://kiwitobes.com/wiki/Perl.html Indexing http://kiwitobes.com/wiki/Module_%28programming%29.html Indexing http://kiwitobes.com/wiki/Open_Directory_Project.html Indexing http://kiwitobes.com/wiki/Common_Gateway_Interface.html Indexing http://kiwitobes.com/wiki/List_%28computing%29.html Indexing http://kiwitobes.com/wiki/C_Sharp.html Indexing http://kiwitobes.com/wiki/Free_software.html Indexing http://kiwitobes.com/wiki/GNU_bison.html Indexing http://kiwitobes.com/wiki/Yacc.html Indexing http://kiwitobes.com/wiki/Solution_stack.html Indexing http://kiwitobes.com/wiki/If_and_only_if.html Indexing http://kiwitobes.com/wiki/Switch_statement.html Indexing http://kiwitobes.com/wiki/Bourne_shell.html Indexing http://kiwitobes.com/wiki/Glue_language.html Indexing http://kiwitobes.com/wiki/Quality_control.html Indexing http://kiwitobes.com/wiki/2001.html Indexing http://kiwitobes.com/wiki/ANSI.html Indexing http://kiwitobes.com/wiki/First-class_function.html Indexing http://kiwitobes.com/wiki/Perl_%28disambiguation%29.html Indexing http://kiwitobes.com/wiki/Hash_table.html Indexing http://kiwitobes.com/wiki/V6_%28Perl%29.html Indexing http://kiwitobes.com/wiki/The_Perl_Foundation.html Indexing http://kiwitobes.com/wiki/AmigaOS.html Indexing http://kiwitobes.com/wiki/Perl_Data_Language.html Indexing http://kiwitobes.com/wiki/Subroutine.html Indexing http://kiwitobes.com/wiki/As_of_2005.html Indexing http://kiwitobes.com/wiki/GNU_General_Public_License.html Indexing http://kiwitobes.com/wiki/Pragma.html Indexing http://kiwitobes.com/wiki/POSIX.html Indexing http://kiwitobes.com/wiki/O%27Reilly_Media.html Indexing http://kiwitobes.com/wiki/Eval.html Indexing http://kiwitobes.com/wiki/Scalar_%28computing%29.html Indexing http://kiwitobes.com/wiki/Filename_extension.html Indexing http://kiwitobes.com/wiki/Token_%28parser%29.html Indexing http://kiwitobes.com/wiki/Control_structure.html Indexing http://kiwitobes.com/wiki/Dynamic_typing.html Indexing 
http://kiwitobes.com/wiki/Closure_%28computer_science%29.html Indexing http://kiwitobes.com/wiki/Perl Indexing http://kiwitobes.com/wiki/Print.html Indexing http://kiwitobes.com/wiki/Microsoft_Windows.html Indexing http://kiwitobes.com/wiki/FreeBSD.html Indexing http://kiwitobes.com/wiki/Expression_%28programming%29.html Indexing http://kiwitobes.com/wiki/AWK_programming_language.html Indexing http://kiwitobes.com/wiki/Peephole_optimization.html Indexing http://kiwitobes.com/wiki/String_%28computer_science%29.html Indexing http://kiwitobes.com/wiki/Software_license.html Indexing http://kiwitobes.com/wiki/SUSE.html Indexing http://kiwitobes.com/wiki/Tar_%28file_format%29.html Indexing http://kiwitobes.com/wiki/Sigil_%28computer_programming%29.html Indexing http://kiwitobes.com/wiki/Pugs.html Indexing http://kiwitobes.com/wiki/BeOS.html Indexing http://kiwitobes.com/wiki/Comprehensive_Perl_Archive_Network.html Indexing http://kiwitobes.com/wiki/PerlScript.html Indexing http://kiwitobes.com/wiki/Gentoo_Linux.html Indexing http://kiwitobes.com/wiki/Data_structure.html Indexing http://kiwitobes.com/wiki/Object-oriented_programming.html Indexing http://kiwitobes.com/wiki/Grammar.html Indexing http://kiwitobes.com/wiki/Unix-like.html Indexing http://kiwitobes.com/wiki/Audrey_Tang.html Indexing http://kiwitobes.com/wiki/Bioinformatics.html Indexing http://kiwitobes.com/wiki/Ruby_programming_language.html Indexing http://kiwitobes.com/wiki/Backronym.html Indexing http://kiwitobes.com/wiki/Memory_management.html Indexing http://kiwitobes.com/wiki/Plain_Old_Documentation.html Indexing http://kiwitobes.com/wiki/Mod_perl.html Indexing http://kiwitobes.com/wiki/Palindrome.html Indexing http://kiwitobes.com/wiki/Unicode.html Indexing http://kiwitobes.com/wiki/Virtual_machine.html Indexing http://kiwitobes.com/wiki/Newsgroup.html Indexing http://kiwitobes.com/wiki/Lex_programming_tool.html Indexing http://kiwitobes.com/wiki/Henry_Spencer.html Indexing 
http://kiwitobes.com/wiki/Wikibooks.html Indexing http://kiwitobes.com/wiki/Lisp_programming_language.html Indexing http://kiwitobes.com/wiki/Assembly_language.html Indexing http://kiwitobes.com/wiki/Thread_%28computer_science%29.html Indexing http://kiwitobes.com/wiki/Golf.html Indexing http://kiwitobes.com/wiki/Code_block.html Indexing http://kiwitobes.com/wiki/1987.html Indexing http://kiwitobes.com/wiki/S-expression.html Indexing http://kiwitobes.com/wiki/Cygwin.html Indexing http://kiwitobes.com/wiki/BASIC-PLUS.html Indexing http://kiwitobes.com/wiki/Unisys.html Indexing http://kiwitobes.com/wiki/Man_page.html Indexing http://kiwitobes.com/wiki/1994.html Indexing http://kiwitobes.com/wiki/Dynamic_language.html Indexing http://kiwitobes.com/wiki/Procedural_programming.html Indexing http://kiwitobes.com/wiki/Parable_of_the_Pearl.html Indexing http://kiwitobes.com/wiki/Multi-paradigm_programming_language.html Indexing http://kiwitobes.com/wiki/Newline.html Indexing http://kiwitobes.com/wiki/Huffman_coding.html Indexing http://kiwitobes.com/wiki/Dynamic_programming_language.html Indexing http://kiwitobes.com/wiki/Procedural_programming_language.html Indexing http://kiwitobes.com/wiki/Array.html Indexing http://kiwitobes.com/wiki/Mac_OS.html Indexing http://kiwitobes.com/wiki/AWK_%28programming_language%29.html Indexing http://kiwitobes.com/wiki/Linux.html Indexing http://kiwitobes.com/wiki/Main_Page.html Indexing http://kiwitobes.com/wiki/C_%28programming_language%29.html Indexing http://kiwitobes.com/wiki/Evaluation_strategy Indexing http://kiwitobes.com/wiki/C%2B%2B.html Indexing http://kiwitobes.com/wiki/Mac_OS_X.html Indexing http://kiwitobes.com/wiki/LAMP_%28software_bundle%29.html Indexing http://kiwitobes.com/wiki/System_administrator.html Indexing http://kiwitobes.com/wiki/Functional_programming.html Indexing http://kiwitobes.com/wiki/Type_system.html Indexing http://kiwitobes.com/wiki/Source_code.html Indexing 
http://kiwitobes.com/wiki/Programming_paradigm.html Indexing http://kiwitobes.com/wiki/PEARL_programming_language.html Indexing http://kiwitobes.com/wiki/Java_programming_language.html Indexing http://kiwitobes.com/wiki/Perl_control_structures.html Indexing http://kiwitobes.com/wiki/Cross-platform.html Indexing http://kiwitobes.com/wiki/Unix.html Indexing http://kiwitobes.com/wiki/Website.html Indexing http://kiwitobes.com/wiki/Wiki_software.html Indexing http://kiwitobes.com/wiki/Perl_regular_expression_examples.html Indexing http://kiwitobes.com/wiki/Perl_Mongers.html Indexing http://kiwitobes.com/wiki/VeriSign.html Indexing http://kiwitobes.com/wiki/Shibboleth Indexing http://kiwitobes.com/wiki/Software_release.html Indexing http://kiwitobes.com/wiki/Brace.html Indexing http://kiwitobes.com/wiki/Unix_shell.html Indexing http://kiwitobes.com/wiki/As_of_2006.html Indexing http://kiwitobes.com/wiki/August_20.html Indexing http://kiwitobes.com/wiki/Artistic_License.html Indexing http://kiwitobes.com/wiki/Data_type.html Indexing http://kiwitobes.com/wiki/Site_Finder.html Indexing http://kiwitobes.com/wiki/2006.html Indexing http://kiwitobes.com/wiki/Latin.html Indexing http://kiwitobes.com/wiki/Acronym.html Indexing http://kiwitobes.com/wiki/Call_stack.html Indexing http://kiwitobes.com/wiki/Debian.html Indexing http://kiwitobes.com/wiki/GNU_Compiler_Collection.html Indexing http://kiwitobes.com/wiki/Operating_system.html Indexing http://kiwitobes.com/wiki/Randal_L._Schwartz.html Indexing http://kiwitobes.com/wiki/1995.html Indexing http://kiwitobes.com/wiki/2005.html Indexing http://kiwitobes.com/wiki/Internet_Movie_Database.html Indexing http://kiwitobes.com/wiki/Megabyte.html Indexing http://kiwitobes.com/wiki/Perl_DBI.html Indexing http://kiwitobes.com/wiki/Regular_expressions.html Indexing http://kiwitobes.com/wiki/Constant_folding.html Indexing http://kiwitobes.com/wiki/Assignment_statement.html Indexing http://kiwitobes.com/wiki/Perl_Monks.html Indexing 
http://kiwitobes.com/wiki/Apache_HTTP_server.html Indexing http://kiwitobes.com/wiki/Acme.html Indexing http://kiwitobes.com/wiki/Perl_Object_Environment.html Indexing http://kiwitobes.com/wiki/C_programming_language.html Indexing http://kiwitobes.com/wiki/Obfuscated_Perl_contest.html Indexing http://kiwitobes.com/wiki/Perl.html Indexing http://kiwitobes.com/wiki/Programming_Perl.html Indexing http://kiwitobes.com/wiki/October_26.html Indexing http://kiwitobes.com/wiki/Data_compression.html Indexing http://kiwitobes.com/wiki/Perl_6.html Indexing http://kiwitobes.com/wiki/Learning_Perl.html Indexing http://kiwitobes.com/wiki/Regular_expression.html Indexing http://kiwitobes.com/wiki/Freenode.html Indexing http://kiwitobes.com/wiki/Syntax.html Indexing http://kiwitobes.com/wiki/Fortran.html Indexing http://kiwitobes.com/wiki/Hacker.html Indexing http://kiwitobes.com/wiki/UseModWiki.html Indexing http://kiwitobes.com/wiki/Shell_%28computing%29.html Indexing http://kiwitobes.com/wiki/Larry_Wall.html Indexing http://kiwitobes.com/wiki/Haskell_programming_language.html Indexing http://kiwitobes.com/wiki/Associative_array.html Indexing http://kiwitobes.com/wiki/CPAN.html Indexing http://kiwitobes.com/wiki/Just_another_Perl_hacker.html Indexing http://kiwitobes.com/wiki/January_31.html Indexing http://kiwitobes.com/wiki/There%27s_more_than_one_way_to_do_it.html Indexing http://kiwitobes.com/wiki/PHP.html Indexing http://kiwitobes.com/wiki/Slash_%28weblog_system%29.html Indexing http://kiwitobes.com/wiki/Reference_%28computer_science%29.html Indexing http://kiwitobes.com/wiki/Object_oriented_programming.html Indexing http://kiwitobes.com/wiki/Perl_interpreter.html Indexing http://kiwitobes.com/wiki/Variable.html Indexing http://kiwitobes.com/wiki/Obfuscated_code.html Indexing http://kiwitobes.com/wiki/December_18.html Indexing http://kiwitobes.com/wiki/Perl_Cookbook.html Indexing http://kiwitobes.com/wiki/Benchmark_%28computing%29.html Indexing 
http://kiwitobes.com/wiki/Sed.html Indexing http://kiwitobes.com/wiki/Unix_manual.html Indexing http://kiwitobes.com/wiki/Tail_call.html Indexing http://kiwitobes.com/wiki/Backtracking.html Indexing http://kiwitobes.com/wiki/October_17.html Indexing http://kiwitobes.com/wiki/Shebang_%28Unix%29.html Indexing http://kiwitobes.com/wiki/XS_%28Perl%29.html Indexing http://kiwitobes.com/wiki/Hello_world.html Indexing http://kiwitobes.com/wiki/Parrot_virtual_machine.html Indexing http://kiwitobes.com/wiki/Python_programming_language.html Indexing http://kiwitobes.com/wiki/Comparison_of_programming_languages.html Indexing http://kiwitobes.com/wiki/Comment.html Indexing http://kiwitobes.com/wiki/SQL.html Indexing http://kiwitobes.com/wiki/Pascal_%28programming_language%29.html Indexing http://kiwitobes.com/wiki/PCRE.html
インデックスを作ってみる
ここでは、ページに対する単語によるインデックスを作成する。 まずsqliteのインストールから始める。その上で、一旦テーブル作成とインデックス設定を行う。
$ sudo apt-get install python-dev
$ sudo pip install pysqlite
#!/usr/bin/python
from pysqlite2 import dbapi2 as sqlite
import urllib3
from BeautifulSoup import *
from urlparse import urljoin
# Set of very common English words to ignore.
ignorewords = {'the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'}
class crawler:
    """Crawler skeleton backed by a SQLite database.

    The class opens the database and can create the index schema, but the
    indexing helpers are still stubs: ``isindexed`` always answers
    ``False`` and ``addtoindex`` only prints a progress line.
    """

    def __init__(self, dbname):
        """Open a connection to the SQLite database ``dbname``."""
        self.con = sqlite.connect(dbname)

    def __del__(self):
        # ``con`` does not exist if connect() raised inside __init__.
        con = getattr(self, 'con', None)
        if con is not None:
            con.close()

    def dbcommit(self):
        """Commit the pending transaction."""
        self.con.commit()

    def getentryid(self, table, field, value, createnew=True):
        """Stub: always returns ``None``."""
        return None

    def addtoindex(self, url, soup):
        """Print a progress line for ``url``; ``soup`` is not used yet."""
        # A parenthesised single-argument print behaves the same on
        # Python 2 and Python 3.
        print('Indexing %s' % url)

    def gettextonly(self, soup):
        """Stub: always returns ``None``."""
        return None

    def separatewords(self, text):
        """Stub: always returns ``None``."""
        return None

    def isindexed(self, url):
        """Stub: always returns ``False``, so every URL counts as new."""
        return False

    def addlinkref(self, urlFrom, urlTo, linkText):
        """Stub: the link between the two pages is not recorded yet."""
        pass

    def crawl(self, pages, depth=2):
        """Crawl breadth-first, starting from ``pages``.

        Each round downloads every page of the current set, passes it to
        ``addtoindex`` and collects the ``http`` links found in its
        ``<a href>`` tags; those links form the page set of the next round.
        The database is committed after each page.

        Args:
            pages: iterable of URLs to start from.
            depth: number of rounds to run.
        """
        # One pool for the whole crawl, so connections can be reused
        # instead of building a new manager for every page.
        http = urllib3.PoolManager()
        for _ in range(depth):
            newpages = set()
            for page in pages:
                # Download the page's contents.
                try:
                    contents = http.request('GET', page).data
                except Exception:
                    # Best effort: report the page and carry on.  Unlike a
                    # bare ``except:``, Ctrl-C and SystemExit still get out.
                    print("Could not open %s" % page)
                    continue
                # Register the page in the index.
                soup = BeautifulSoup(contents)
                self.addtoindex(page, soup)
                # Walk the page's links.
                for link in soup('a'):
                    if 'href' not in dict(link.attrs):
                        continue
                    url = urljoin(page, link['href'])
                    if "'" in url:
                        continue
                    url = url.split('#')[0]  # drop the anchor part
                    # Queue the page for the next round if it is new.
                    if url.startswith('http') and not self.isindexed(url):
                        newpages.add(url)
                    linkText = self.gettextonly(link)
                    self.addlinkref(page, url, linkText)
                self.dbcommit()
            pages = newpages

    def createindextables(self):
        """Create the index tables and their indexes, then commit.

        ``if not exists`` makes the call safe to repeat against a database
        that already holds the schema.
        """
        statements = (
            'create table if not exists urllist(url)',
            'create table if not exists wordlist(word)',
            'create table if not exists wordlocation(urlid,wordid,location)',
            'create table if not exists link(fromid integer, toid integer)',
            'create table if not exists linkwords(wordid,linkid)',
            'create index if not exists wordidx on wordlist(word)',
            'create index if not exists urlidx on urllist(url)',
            'create index if not exists urltoidx on link(toid)',
            'create index if not exists urlfromidx on link(fromid)',
        )
        for statement in statements:
            self.con.execute(statement)
        self.dbcommit()
if __name__ == '__main__':
    pagelist = ['http://kiwitobes.com/wiki/Perl.html']
    # Named ``spider`` so that the instance does not rebind (and hide)
    # the ``crawler`` class itself.
    spider = crawler('searchindex.db')
    spider.createindextables()
    # Crawling is left disabled in this step; only the tables are created.
    # spider.crawl(pagelist)
ページ内の単語を抽出
以下の手順で行う。
- HTMLからタグを排除して、文字列だけ取り出す。
- [a-zA-Z0-9_]+(正規表現の\w+)に一致する文字の並びを一つの単語として、分割してリストを生成する。
- (この記事の範囲外)各単語に対してステミングを行い、語幹のみのリストを生成する。
#!/usr/bin/python
from pysqlite2 import dbapi2 as sqlite
import urllib3
from BeautifulSoup import *
from urlparse import urljoin
# Set of very common English words to ignore.
ignorewords = {'the', 'of', 'to', 'and', 'a', 'in', 'is', 'it'}
class crawler:
    """Crawler that stores a word index and link graph in SQLite.

    Pages are downloaded by ``crawl``; their words go into ``wordlist`` /
    ``wordlocation`` and their links into ``link`` / ``linkwords``.
    """

    def __init__(self, dbname):
        """Open a connection to the SQLite database ``dbname``."""
        self.con = sqlite.connect(dbname)

    def __del__(self):
        # ``con`` does not exist if connect() raised inside __init__.
        con = getattr(self, 'con', None)
        if con is not None:
            con.close()

    def dbcommit(self):
        """Commit the pending transaction."""
        self.con.commit()

    def getentryid(self, table, field, value, createnew=True):
        """Return the rowid of the row in ``table`` whose ``field`` is ``value``.

        Args:
            table: table name.
            field: column name.
            value: value to look up; bound as a SQL parameter.
            createnew: when true (the default) a missing row is inserted
                and its new rowid returned; when false, ``None`` is
                returned for a missing row.
        """
        # Identifiers cannot be bound as parameters; every call in this
        # class passes literal table and column names.  The value comes
        # from crawled pages, so it is always bound.
        cur = self.con.execute(
            "select rowid from %s where %s=?" % (table, field), (value,))
        res = cur.fetchone()
        if res is not None:
            return res[0]
        if not createnew:
            return None
        cur = self.con.execute(
            "insert into %s (%s) values (?)" % (table, field), (value,))
        return cur.lastrowid

    def addtoindex(self, url, soup):
        """Index the words of the page ``soup`` fetched from ``url``.

        Does nothing when ``isindexed(url)`` is already true.  Each word
        outside ``ignorewords`` is stored in ``wordlocation`` with its
        position in the page's word list.
        """
        if self.isindexed(url):
            return
        # A parenthesised single-argument print behaves the same on
        # Python 2 and Python 3.
        print('Indexing %s' % url)
        # Extract the words.
        text = self.gettextonly(soup)
        words = self.separatewords(text)
        # Get the URL id.
        urlid = self.getentryid('urllist', 'url', url)
        # Link each word to the URL.  Positions count ignored words too.
        for location, word in enumerate(words):
            if word in ignorewords:
                continue
            wordid = self.getentryid('wordlist', 'word', word)
            self.con.execute(
                "insert into wordlocation(urlid,wordid,location) "
                "values (?,?,?)", (urlid, wordid, location))

    def gettextonly(self, soup):
        """Return the text under ``soup`` with the markup removed.

        A node whose ``string`` is set yields that string stripped.  Any
        other node yields the text of each item in ``contents``, each
        followed by a newline.
        """
        v = soup.string
        if v is not None:
            return v.strip()
        return ''.join(self.gettextonly(child) + '\n'
                       for child in soup.contents)

    def separatewords(self, text):
        """Split ``text`` on non-word characters into lowercase words."""
        # Imported here so the method does not depend on ``re`` being
        # leaked into the module by a star import.
        import re
        # ``\W+`` rather than ``\W*``: a pattern that can match the empty
        # string splits between every character on Python 3.7+.
        return [s.lower() for s in re.split(r'\W+', text) if s != '']

    def isindexed(self, url):
        """Return True if ``url`` is in ``urllist`` and has word locations."""
        row = self.con.execute(
            "select rowid from urllist where url=?", (url,)).fetchone()
        if row is None:
            return False
        hit = self.con.execute(
            "select * from wordlocation where urlid=?", (row[0],)).fetchone()
        return hit is not None

    def addlinkref(self, urlFrom, urlTo, linkText):
        """Record a link from ``urlFrom`` to ``urlTo`` and its link words.

        Links from a page to itself are not recorded.  Words of
        ``linkText`` outside ``ignorewords`` are stored in ``linkwords``.
        """
        words = self.separatewords(linkText)
        fromid = self.getentryid('urllist', 'url', urlFrom)
        toid = self.getentryid('urllist', 'url', urlTo)
        if fromid == toid:
            return
        cur = self.con.execute(
            "insert into link(fromid, toid) values (?,?)", (fromid, toid))
        linkid = cur.lastrowid
        for word in words:
            if word in ignorewords:
                continue
            wordid = self.getentryid('wordlist', 'word', word)
            self.con.execute(
                "insert into linkwords(linkid,wordid) values (?,?)",
                (linkid, wordid))

    def crawl(self, pages, depth=2):
        """Crawl breadth-first, starting from ``pages``.

        Each round downloads every page of the current set, indexes it and
        records its ``<a href>`` links; the ``http`` links not yet indexed
        form the page set of the next round.  The database is committed
        after each page.

        Args:
            pages: iterable of URLs to start from.
            depth: number of rounds to run.
        """
        # One pool for the whole crawl, so connections can be reused
        # instead of building a new manager for every page.
        http = urllib3.PoolManager()
        for _ in range(depth):
            newpages = set()
            for page in pages:
                # Download the page's contents.
                try:
                    contents = http.request('GET', page).data
                except Exception:
                    # Best effort: report the page and carry on.  Unlike a
                    # bare ``except:``, Ctrl-C and SystemExit still get out.
                    print("Could not open %s" % page)
                    continue
                # Register the page in the index.
                soup = BeautifulSoup(contents)
                self.addtoindex(page, soup)
                # Walk the page's links.
                for link in soup('a'):
                    if 'href' not in dict(link.attrs):
                        continue
                    url = urljoin(page, link['href'])
                    # URLs containing a quote are skipped, as before, so
                    # the set of crawled pages stays the same.
                    if "'" in url:
                        continue
                    url = url.split('#')[0]  # drop the anchor part
                    # Queue the page for the next round if it is new.
                    if url.startswith('http') and not self.isindexed(url):
                        newpages.add(url)
                    linkText = self.gettextonly(link)
                    self.addlinkref(page, url, linkText)
                self.dbcommit()
            pages = newpages

    def createindextables(self):
        """Create the index tables and their indexes, then commit.

        ``if not exists`` makes the call safe to repeat against a database
        that already holds the schema.
        """
        statements = (
            'create table if not exists urllist(url)',
            'create table if not exists wordlist(word)',
            'create table if not exists wordlocation(urlid,wordid,location)',
            'create table if not exists link(fromid integer, toid integer)',
            'create table if not exists linkwords(wordid,linkid)',
            'create index if not exists wordidx on wordlist(word)',
            'create index if not exists urlidx on urllist(url)',
            'create index if not exists urltoidx on link(toid)',
            'create index if not exists urlfromidx on link(fromid)',
        )
        for statement in statements:
            self.con.execute(statement)
        self.dbcommit()
if __name__ == '__main__':
    pagelist = ['http://kiwitobes.com/wiki/Perl.html']
    # Named ``spider`` so that the instance does not rebind (and hide)
    # the ``crawler`` class itself.
    spider = crawler('searchindex.db')
    spider.crawl(pagelist)
    # Show the wordlocation rows recorded for the word whose id is 1.
    print([row for row in spider.con.execute(
        'select rowid from wordlocation where wordid=1')])
リファレンス
この記事は以下の本を元に知人に解説するために説明の仕方を変えて記述したものである。なお、1章と2章は公開されている。 集合知プログラミング1,2章
|
【楽天ブックスならいつでも送料無料】集合知プログラミング [ トビ-・セガラン ] 価格:3,672円(税込、送料込) |