Add fetch-urls script

This commit is contained in:
Rafael Weinstein
2015-07-23 13:07:44 -07:00
parent f229396931
commit 2a610b6866
4 changed files with 52 additions and 5 deletions

1
clients/pitchmap/.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
node_modules

View File

@@ -9,13 +9,16 @@ http://gd2.mlb.com/components/game/mlb/
To use:
```
cd /tmp/foo
wget -e robots=off -A "[0-9]*.xml" -r -l1 \
http://gd2.mlb.com/components/game/mlb/year_2015/month_05/day_12/gid_2015_05_12_atlmlb_cinmlb_1/pitchers/
npm install
wget -e robots=off -A "inning_[0-9]*.xml" -r -l1 \
http://gd2.mlb.com/components/game/mlb/year_2015/month_05/day_12/gid_2015_05_12_atlmlb_cinmlb_1/inning/
mkdir /tmp/mlb_data
node fetch-urls.js --url="http://gd2.mlb.com/components/game/mlb/year_2015/month_05/day_07/" --print="gid.*pitcher.*xml$|gid.*inning_[0-9]*\.xml" --reject="\/year_[0-9]+\/pitchers\/|\/year_[0-9]+\/mobile\/|\/year_[0-9]+\/media\/|\/year_[0-9]+\/batters\/|\/premium\/|\/notifications\/|\/pitching_staff\/|\/media\/|\/batters\/|\/[^\/]+\.[^\/]+$" > /tmp/mlb_data/urls.txt
cd /tmp/mlb_data
wget -i urls.txt
<noms>/clients/xml_importer/xml_importer --file-store=/tmp/mlb_data --dataset-id=mlb/xml gd2.mlb.com/

View File

@@ -0,0 +1,36 @@
// fetch-urls: crawls outward from --url and writes to stdout every discovered
// URL that matches --print. Discovered URLs that fall outside --url, or that
// match --reject, are not followed.
var argv = require('argv');
var args = argv.option([
  {
    name: 'url',
    type: 'string',
    description: 'URL to start fetching from'
  },
  {
    name: 'print',
    type: 'string',
    description: 'print urls found in files which match this regexp'
  },
  {
    name: 'reject',
    type: 'string',
    description: 'dont follow urls which match this regexp'
  }
]).run();

var rootURL = args.options.url;
if (!rootURL) {
  // Report on stderr: stdout carries the URL list and is typically redirected
  // to a file.
  console.error('Missing required option: --url');
  process.exit(1);
}

// An omitted --print keeps the permissive behaviour: the empty pattern matches
// every string, so every discovered URL is printed.
var printRegexp = new RegExp(args.options.print || '');

// An omitted --reject must mean "reject nothing". Building a RegExp from
// undefined yields the empty pattern, which matches everything and would
// cause every discovered URL to be rejected.
var rejectRegexp = args.options.reject ? new RegExp(args.options.reject) : null;

var crawler = require('simplecrawler').crawl(rootURL);
crawler.maxDepth = 3;

// Fetch condition: invoked with each discovered URL; returning false tells the
// crawler not to fetch it. Printing happens here as a side effect, so a URL
// may be printed even when it is not followed.
crawler.addFetchCondition(function(parsedURL) {
  // Rebuild a plain string from the protocol, host and uriPath fields only.
  var url = parsedURL.protocol + '://' + parsedURL.host + parsedURL.uriPath;

  if (printRegexp.test(url)) {
    console.log(url);
  }

  // Only URLs containing the root URL are eligible to be followed.
  var outsideRoot = url.indexOf(rootURL) < 0;
  var rejected = rejectRegexp !== null && rejectRegexp.test(url);
  return !(outsideRoot || rejected);
});

View File

@@ -0,0 +1,7 @@
{
"name": "fetch-urls",
"dependencies": {
"argv": "*",
"simplecrawler": "*"
}
}