#!/usr/bin/env node
//===============================================
// Fetch one page of restaurant data from the Tabelog API, add an image URL
// and a description to every restaurant, and insert the records into MongoDB.
// Settings are read from ./config/config.yml; see the command options below.

//===============================================
// Load Libraries
var cheerio  = require("cheerio");
var request  = require("request");
var yaml     = require("yaml");
var fs       = require("fs");
var command  = require("commander");
var mongoose = require("mongoose");
var log4js   = require('log4js');
var jquery   = require('jquery');

//===============================================
// Logging Setting
log4js.configure({
  appenders: [{
    "type": "dateFile",
    "filename": "./logs/insert_tabelog_datas.log",
    "pattern": "-yyyy-MM-dd"
  }]
});
var logger = log4js.getLogger("dateFile");

//===============================================
// Load config file
var configData = fs.readFileSync("./config/config.yml", "utf8");
var config = yaml.eval(configData);

//===============================================
// Make request URL
var requestBaseUrl = config.tabelog_api.base_url;
var apiKey = config.tabelog_api.api_key;

// Coerce the --pagenum value to a base-10 integer.
// parseInt must not be handed to commander directly: commander calls the
// coercion function with a second argument, which parseInt would treat as
// the radix.
function parse_page_number(value) {
  return parseInt(value, 10);
}

// parameter
// parse command options and create help command automatically
command
  .version('1.0.0')
  .usage('[option]')
  .option('-p, --prefecture <String>', 'prefecture name (default japan)')
  .option('-n, --pagenum <n>', 'page number (max 60)', parse_page_number)
  .parse(process.argv);

var pageNumber = 1;
var prefecture = "japan";
if (command.prefecture) prefecture = command.prefecture;
if (command.pagenum) pageNumber = command.pagenum;

var queryParams = [
  "Prefecture=" + prefecture,
  "PageNum=" + pageNumber,
  "ResultSet=large",
  "Key=" + apiKey
];
// join
var requestUrl = requestBaseUrl + "?" + queryParams.join("&");

//===============================================
// DB setting
var db = mongoose.connect(
  'mongodb://' + config.event_db.host + '/' + config.event_db.db_name,
  function (err) {
    if (err) {
      logger.error("Connection Fail. mongodb://" + config.event_db.host + "/" + config.event_db.db_name);
    } else {
      logger.info("Connection Success!");
    }
  }
);

var InsertTabelogDataConfigSchema = new mongoose.Schema({
  cur_prefecture : { type: Number, default: 0 },
  cur_pagenum    : Number,
  max_pagenum    : Number,
  created_time   : { type: Date, default: Date.now },
  update_time    : { type: Date, default: Date.now }
});

var DBSchema = new mongoose.Schema({
  event_id      : { type: Number, default: 0 },
  genre_id      : Number,
  title         : String,
  image         : String,
  description   : String,
  url           : String,
  station       : String,
  address       : String,
  business_hour : String,
  holiday       : String,
  latitude      : Number,
  longitude     : Number,
  created_time  : { type: Date, default: Date.now },
  update_time   : { type: Date, default: Date.now }
});

// Before a new document is saved, increment the per-collection counter kept
// in the 'current_event_id' collection and store it as the document's event_id.
DBSchema.pre('save', function (next) {
  if (!this.isNew) return next();

  var model = this;

  model.db.db.executeDbCommand({
    findAndModify: 'current_event_id',        // 'command name': 'target collection name'
    query: { name: model.collection.name },   // search condition
    update: {
      $set: { name: model.collection.name },
      $inc: { sequence: 1 }
    },
    new: true,    // whether to receive the updated document
    upsert: true  // whether to insert when nothing was found
  }, function (err, data) {
    if (!err && data.documents[0].ok) {
      // set the obtained sequence value as the event id
      model.event_id = data.documents[0].value.sequence;
      next();
    } else {
      next(err || new Error(data.documents[0].errmsg));
    }
  });
});

var DBclass = db.model("sample_db", DBSchema);

//===============================================
// Shared helpers

// Request `url` and parse the body with cheerio (XML mode).
// Returns a promise resolved with the cheerio document, or rejected with an
// Error when the request fails or the status code is not 200.
function fetch_document(url) {
  var deferred = jquery.Deferred();

  request({url: url}, function (error, response, body) {
    if (error) {
      deferred.reject(error instanceof Error ? error : new Error(error));
      return;
    }
    if (response.statusCode !== 200) {
      // `error` is null here, so build a message that names the actual cause.
      deferred.reject(new Error("unexpected status code " + response.statusCode + " for " + url));
      return;
    }
    logger.info("response statusCode : " + response.statusCode);
    deferred.resolve(cheerio.load(body, {ignoreWhitespace: true, xmlMode: true}));
  });

  return deferred.promise();
}

// Return the text data of the first child node of `nodes[index]`.
// Returns "" instead of throwing when the node, its first child, or that
// child's data is missing.
function child_text(nodes, index) {
  var node = nodes[index];
  if (!node || !node["children"] || !node["children"][0]) {
    return "";
  }
  var text = node["children"][0]["data"];
  return text === undefined ? "" : text;
}

// Run `step` once per element of `data`, one call after another.
// `step` receives [index, data] and must return a promise for [index + 1, data].
// Returns a promise resolved with `data`, or rejected as soon as a step fails.
// With an empty `data`, `step` is never called.
function process_each_in_sequence(data, step) {
  var chain = jquery.Deferred().resolve([0, data]).promise();

  for (var idx = 0; idx < data.length; idx++) {
    chain = chain.then(step);
  }

  return chain.then(function (state) {
    return state[1];
  });
}

//===============================================
// Get xml data about restaurant from tabelogAPI
// Returns a promise resolved with the cheerio document of the response.
function fetch_tabelogAPI_response(api_request_url) {
  return fetch_document(api_request_url);
}

//===============================================
// Parse response data from tabelogAPI
// `response` is the cheerio document of the API response. Each <Item> is
// read by child position and turned into one plain object; the array of
// those objects is returned.
function parse_tabelogAPI_response(response) {
  var restaurant_data = [];

  response("Item").each(function (i, xmlItem) {
    var fields = response(xmlItem).children();
    var rcd = child_text(fields, 0);

    var data = {};
    data["genre_id"]      = 1;
    data["title"]         = child_text(fields, 1);
    data["image"]         = "";
    data["description"]   = "";
    data["url"]           = child_text(fields, 2);
    data["station"]       = child_text(fields, 12);
    data["address"]       = child_text(fields, 13);
    data["business_hour"] = child_text(fields, 15);
    data["holiday"]       = child_text(fields, 16);
    data["latitude"]      = parseFloat(child_text(fields, 17));
    data["longitude"]     = parseFloat(child_text(fields, 18));
    data["rcd"]           = rcd;
    restaurant_data.push(data);

    logger.info("Retrieve data: " + i);
    logger.info(data);
    logger.info("Rcd : " + rcd);
  });

  return restaurant_data;
}

//===============================================
// Get image URL from tabelogAPI
var image_api_base_url = "http://api.tabelog.com/Ver1/ReviewImageSearch/?Key=" + apiKey + "&Rcd=";

// Fill in the "image" field of every restaurant in `data`, one request at a
// time. Returns a promise resolved with `data`.
function fetch_restaurant_image_url(data) {
  return process_each_in_sequence(data, parse_tabelog_imageAPI_response);
}

// Fetch the image of one restaurant.
// `data` is [index, restaurants]; the promise resolves with
// [index + 1, restaurants] after restaurants[index]["image"] has been set.
function parse_tabelog_imageAPI_response(data) {
  var idx = data[0];
  var info = data[1];
  var image_api_url = image_api_base_url + info[idx]["rcd"];

  return fetch_document(image_api_url).then(function (doc) {
    // The third child of the <Item> children is read as the image URL;
    // the value is "" when that node is absent.
    info[idx]["image"] = child_text(doc("Item").children(), 2);
    logger.info("image url of rcd " + info[idx]["rcd"] + " : " + info[idx]["image"]);
    return [idx + 1, info];
  });
}

//===============================================
// Get restaurant description:
// Fill in the "description" field of every restaurant in `data`, one request
// at a time. Returns a promise resolved with `data`.
function fetch_restaurant_description(data) {
  return process_each_in_sequence(data, parse_tabelog_description_response);
}

// Fetch the page at restaurants[index]["url"] and store the text of its
// first "p.comment" element as the description.
// `data` is [index, restaurants]; the promise resolves with
// [index + 1, restaurants].
function parse_tabelog_description_response(data) {
  var idx = data[0];
  var info = data[1];

  return fetch_document(info[idx]["url"]).then(function (doc) {
    info[idx]["description"] = doc("p.comment").first().text();
    return [idx + 1, info];
  });
}

//===============================================
// Delete unuseful field
// Removes the "rcd" key from every record (it has no field in DBSchema) and
// returns a promise already resolved with `data`.
function delete_unuseful_field(data) {
  for (var idx = 0; idx < data.length; idx++) {
    delete data[idx]["rcd"];
  }
  return jquery.Deferred().resolve(data).promise();
}

//===============================================
// Insert data for mongo
// Saves every record and returns a promise that resolves with `data` once
// every save callback has run. A failed save is logged and does not reject
// the promise.
function insert_restaurant_data(data) {
  var deferred = jquery.Deferred();
  var pending = data.length;

  if (pending === 0) {
    return deferred.resolve(data).promise();
  }

  // A separate function per record keeps `record` bound to the right element
  // when the asynchronous save callback finally runs.
  function save_record(record) {
    var newPost = new DBclass(record);
    newPost.save(function (err) {
      if (err) {
        logger.error("insert error :" + record["title"] + " (" + err + ")");
      } else {
        logger.info("insert is successful with " + record["title"]);
      }
      pending -= 1;
      if (pending === 0) {
        deferred.resolve(data);
      }
    });
  }

  for (var idx = 0; idx < data.length; idx++) {
    save_record(data[idx]);
  }

  return deferred.promise();
}

//===============================================
// [Main] Analyze response
logger.info("request url: " + requestUrl);

// Fetch restaurant data
fetch_tabelogAPI_response(requestUrl)
  // Parse the data
  .then(parse_tabelogAPI_response)
  // Get image URL from tabelogAPI
  .then(fetch_restaurant_image_url)
  // Get description from restaurant page
  .then(fetch_restaurant_description)
  // Delete unuseful field of data
  .then(delete_unuseful_field)
  // Insert retrieval data to mongo
  .then(insert_restaurant_data)
  // output
  .done(function (data) {
    console.log(data);
  })
  // Report a failure from any step instead of stalling silently.
  .fail(function (err) {
    logger.error("insert_tabelog_datas failed: " + err);
  })
  // Close the MongoDB connection once the pipeline has finished either way.
  .always(function () {
    mongoose.disconnect();
  });
Streamlit is a …
I bought M5Stac…