From 2a610b6866deea4c8872ec882dff4e891e8b9f48 Mon Sep 17 00:00:00 2001 From: Rafael Weinstein Date: Thu, 23 Jul 2015 13:07:44 -0700 Subject: [PATCH] Add fetch-urls script --- clients/pitchmap/.gitignore | 1 + clients/pitchmap/README.md | 13 +++++++----- clients/pitchmap/fetch-urls.js | 36 ++++++++++++++++++++++++++++++++++ clients/pitchmap/package.json | 7 +++++++ 4 files changed, 52 insertions(+), 5 deletions(-) create mode 100644 clients/pitchmap/.gitignore create mode 100644 clients/pitchmap/fetch-urls.js create mode 100644 clients/pitchmap/package.json diff --git a/clients/pitchmap/.gitignore b/clients/pitchmap/.gitignore new file mode 100644 index 0000000000..3c3629e647 --- /dev/null +++ b/clients/pitchmap/.gitignore @@ -0,0 +1 @@ +node_modules diff --git a/clients/pitchmap/README.md b/clients/pitchmap/README.md index 992c174ad2..1b5722150c 100644 --- a/clients/pitchmap/README.md +++ b/clients/pitchmap/README.md @@ -9,13 +9,16 @@ http://gd2.mlb.com/components/game/mlb/ To use: ``` -cd /tmp/foo -wget -e robots=off -A "[0-9]*.xml" -r -l1 \ -http://gd2.mlb.com/components/game/mlb/year_2015/month_05/day_12/gid_2015_05_12_atlmlb_cinmlb_1/pitchers/ +npm install -wget -e robots=off -A "inning_[0-9]*.xml" -r -l1 \ -http://gd2.mlb.com/components/game/mlb/year_2015/month_05/day_12/gid_2015_05_12_atlmlb_cinmlb_1/inning/ +mkdir /tmp/mlb_data + +node fetch-urls.js --url="http://gd2.mlb.com/components/game/mlb/year_2015/month_05/day_07/" --print="gid.*pitcher.*xml$|gid.*inning_[0-9]*\.xml" --reject="\/year_[0-9]+\/pitchers\/|\/year_[0-9]+\/mobile\/|\/year_[0-9]+\/media\/|\/year_[0-9]+\/batters\/|\/premium\/|\/notifications\/|\/pitching_staff\/|\/media\/|\/batters\/|\/[^\/]+\.[^\/]+$" > /tmp/mlb_data/urls.txt + +cd /tmp/mlb_data + +wget -i urls.txt /clients/xml_importer/xml_importer --file-store=/tmp/mlb_data --dataset-id=mlb/xml gd2.mlb.com/ diff --git a/clients/pitchmap/fetch-urls.js b/clients/pitchmap/fetch-urls.js new file mode 100644 index 0000000000..6ba5bd9415 --- 
/dev/null
+++ b/clients/pitchmap/fetch-urls.js
@@ -0,0 +1,57 @@
+// Crawls outward from --url and writes every discovered URL that matches
+// --print to stdout, one per line. URLs that match --reject, or that do not
+// contain the root URL, are not fetched.
+//
+// Usage:
+//   node fetch-urls.js --url=<root> [--print=<regexp>] [--reject=<regexp>]
+var argv = require('argv');
+var args = argv.option([
+  {
+    name: 'url',
+    type: 'string',
+    description: 'URL to start fetching from'
+  },
+  {
+    name: 'print',
+    type: 'string',
+    description: 'print urls found in files which match this regexp'
+  },
+  {
+    name: 'reject',
+    type: 'string',
+    description: 'dont follow urls which match this regexp'
+  }
+]).run();
+
+var rootURL = args.options.url;
+if (!rootURL) {
+  console.error('fetch-urls: --url is required');
+  process.exit(1);
+}
+
+// new RegExp(undefined) compiles to /(?:)/, which matches every string. Only
+// build a pattern when the flag was supplied: an omitted --reject must reject
+// nothing (it used to reject every URL, so nothing was ever crawled), while an
+// omitted --print keeps printing every URL seen.
+var printRegexp = args.options.print ? new RegExp(args.options.print) : null;
+var rejectRegexp = args.options.reject ? new RegExp(args.options.reject) : null;
+
+var crawler = require("simplecrawler").crawl(rootURL);
+crawler.maxDepth = 3;
+
+// Fetch condition: the return value tells the crawler whether to fetch the
+// URL (true) or skip it (false). NOTE(review): confirm against simplecrawler.
+crawler.addFetchCondition(function(parsedURL) {
+  // Rebuild the URL from protocol, host and path only.
+  var fullURL = parsedURL.protocol + '://' + parsedURL.host + parsedURL.uriPath;
+  var print = !printRegexp || printRegexp.test(fullURL);
+  var reject = fullURL.indexOf(rootURL) < 0 ||
+      (!!rejectRegexp && rejectRegexp.test(fullURL));
+
+  // Printing is independent of rejection: a URL can be listed without being fetched.
+  if (print) {
+    console.log(fullURL);
+  }
+
+  return !reject;
+});
diff --git a/clients/pitchmap/package.json b/clients/pitchmap/package.json
new file mode 100644
index 0000000000..3ee9edf8be
--- /dev/null
+++ b/clients/pitchmap/package.json
@@ -0,0 +1,7 @@
+{
+  "name": "fetch-urls",
+  "dependencies": {
+    "argv": "*",
+    "simplecrawler": "*"
+  }
+}