Categories: Uncategorized

[ruby][nokogiri] Counting nouns in news titles using the Yahoo News API and the Yahoo Morphological Analysis API

Contents

Target

Count the nouns that appear in news titles, using the Yahoo News API to fetch the titles and the Yahoo Morphological Analysis API to split them into words.

Source

Gemfile

source 'https://rubygems.org'

gem 'rake'
gem 'nokogiri'
gem 'activesupport'

count_noun.rb

# -- coding: utf-8

require "cgi"
require "open-uri"
require "rubygems"
require "nokogiri"

# Fetches the Yahoo news topics feed and returns the topic titles.
#
# Builds the request URL from the application ID and the `pickupcategory`
# filter, downloads the XML response and collects the text of every `Title`
# element in the `urn:yahoo:jp:news` namespace.
#
# @return [Array<String>] title texts in document order; empty when the
#   response contains no matching element
# @raise [OpenURI::HTTPError] when the server answers with an HTTP error status
def get_news_titles
    news_api_url = "http://news.yahooapis.jp/NewsWebService/V2/topics"
    news_params = {
        # Placeholder text ("please put your application ID here");
        # replace it with a real application ID before running.
        appid: "あなたのアプリケーションIDを入れてください",
        pickupcategory: 'all'
    }
    query = news_params.map { |k, v| "#{CGI.escape(k.to_s)}=#{CGI.escape(v.to_s)}" }.join('&')
    news_url = "#{news_api_url}?#{query}"

    # Kernel#open stopped accepting URLs in Ruby 3.0, so go through the URI
    # object that open-uri extends. The block form closes the response stream
    # once the document has been parsed.
    doc = URI.parse(news_url).open { |io| Nokogiri::XML(io) }
    doc.xpath('//news:Title', {'news' => 'urn:yahoo:jp:news'}).map(&:text)
end

# Sends +text+ to the Yahoo morphological analysis API and returns the words
# it was split into, each paired with its part of speech.
#
# The surfaces and the part-of-speech tags are collected with two separate
# XPath queries and paired up by position.
#
# @param text [String, nil] sentence to analyse
# @return [Array<Array(String, String)>] `[surface, pos]` pairs in document
#   order; an empty array when +text+ is nil or blank
# @raise [OpenURI::HTTPError] when the server answers with an HTTP error status
def get_words(text)
    # Nothing to analyse: skip the network round trip entirely.
    return [] if text.to_s.strip.empty?

    ma_api_url = "http://jlp.yahooapis.jp/MAService/V1/parse"
    ma_params = {
        # Placeholder text ("please put your application ID here");
        # replace it with a real application ID before running.
        appid: "あなたのアプリケーションIDを入れてください",
        sentence: text,
        results: 'ma',
        response: 'reading,surface,pos'
    }
    query = ma_params.map { |k, v| "#{CGI.escape(k.to_s)}=#{CGI.escape(v.to_s)}" }.join('&')
    ma_url = "#{ma_api_url}?#{query}"

    # Kernel#open stopped accepting URLs in Ruby 3.0, so go through the URI
    # object that open-uri extends. The block form closes the response stream
    # once the document has been parsed.
    doc = URI.parse(ma_url).open { |io| Nokogiri::XML(io) }

    jlp_ns = {'jlp' => 'urn:yahoo:jp:jlp'}
    poss = doc.xpath('//jlp:pos', jlp_ns).map(&:text)
    words = doc.xpath('//jlp:surface', jlp_ns).map(&:text)
    words.zip(poss)
end

# Tallies the nouns found in +words+ into +wordset+, mutating it in place.
#
# @param wordset [Hash{String => Integer}] running noun counts; updated in place
# @param words [Array<Array(String, String)>] `[surface, pos]` pairs
# @return [Array] the +words+ argument itself
def add_noun_to_hash(wordset, words)
    words.each do |surface, part_of_speech|
        # Only entries tagged as a noun are counted.
        next unless part_of_speech == "名詞"

        wordset[surface] = wordset.fetch(surface, 0) + 1
    end
end

# Driver: fetch the news titles, analyse each one, and accumulate the noun
# counts over all titles.
wordset = {}

get_news_titles.each do |title|
    add_noun_to_hash(wordset, get_words(title))
end

# Print one "noun:<TAB>count" line per distinct noun, in insertion order.
wordset.each { |noun, count| puts "#{noun}:\t#{count}" }

Result

福島:   1
原発:   1
事故:   1
全員:   1
不起訴: 1
被爆:   1
1:      1
長崎:   1
写真:   1
発見:   1
韓国:   1
美容:   1
整形手術:       1
増税:   1
事前:   1
運動:   1
自民:   1
秘書:   1
書類送検:       1
駅:     1
エスカレーター: 1
歩行:   1
ダメ:   1
北朝鮮: 1
携帯:   1
電話:   1
規制:   1
強化:   1
歌舞伎町:       1
暴走:   1
23:     1
摘発:   1
肉のハナマサ:   1
買収:   1
9:      1
めど:   1
乗り:   1
正念場: 1
中国:   1
経済:   1
甲子園: 1
米:     1
注目:   1
zuqqhi2