mirror of
https://github.com/dolthub/dolt.git
synced 2026-01-30 19:09:34 -06:00
Add fetch-urls script
This commit is contained in:
1
clients/pitchmap/.gitignore
vendored
Normal file
1
clients/pitchmap/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
node_modules
|
||||
@@ -9,13 +9,16 @@ http://gd2.mlb.com/components/game/mlb/
|
||||
To use:
|
||||
|
||||
```
|
||||
cd /tmp/foo
|
||||
|
||||
wget -e robots=off -A "[0-9]*.xml" -r -l1 \
|
||||
http://gd2.mlb.com/components/game/mlb/year_2015/month_05/day_12/gid_2015_05_12_atlmlb_cinmlb_1/pitchers/
|
||||
npm install
|
||||
|
||||
wget -e robots=off -A "inning_[0-9]*.xml" -r -l1 \
|
||||
http://gd2.mlb.com/components/game/mlb/year_2015/month_05/day_12/gid_2015_05_12_atlmlb_cinmlb_1/inning/
|
||||
mkdir /tmp/mlb_data
|
||||
|
||||
node fetch-urls.js --url="http://gd2.mlb.com/components/game/mlb/year_2015/month_05/day_07/" --print="gid.*pitcher.*xml$|gid.*inning_[0-9]*\.xml" --reject="\/year_[0-9]+\/pitchers\/|\/year_[0-9]+\/mobile\/|\/year_[0-9]+\/media\/|\/year_[0-9]+\/batters\/|\/premium\/|\/notifications\/|\/pitching_staff\/|\/media\/|\/batters\/|\/[^\/]+\.[^\/]+$" > /tmp/mlb_data/urls.txt
|
||||
|
||||
cd /tmp/mlb_data
|
||||
|
||||
wget -i urls.txt
|
||||
|
||||
<noms>/clients/xml_importer/xml_importer --file-store=/tmp/mlb_data --dataset-id=mlb/xml gd2.mlb.com/
|
||||
|
||||
|
||||
36
clients/pitchmap/fetch-urls.js
Normal file
36
clients/pitchmap/fetch-urls.js
Normal file
@@ -0,0 +1,36 @@
|
||||
// Crawl outward from --url and print (to stdout) every discovered URL that
// matches --print. URLs that match --reject, or that do not contain the
// starting URL, are not followed.
//
// Usage:
//   node fetch-urls.js --url=<start> [--print=<regexp>] [--reject=<regexp>]
var argv = require('argv');

var args = argv.option([
  {
    name: 'url',
    type: 'string',
    description: 'URL to start fetching from'
  },
  {
    name: 'print',
    type: 'string',
    description: 'print urls found in files which match this regexp'
  },
  {
    name: 'reject',
    type: 'string',
    description: 'dont follow urls which match this regexp'
  }
]).run();

var rootURL = args.options.url;
if (!rootURL) {
  // Report on stderr: stdout is the URL list and is typically redirected
  // into a file that is later fed to wget.
  console.error('fetch-urls: --url is required');
  process.exit(1);
}

// An omitted --print keeps the historical behaviour of printing every
// discovered URL (an empty pattern matches any string).
var printRegexp = new RegExp(args.options.print || '');

// An omitted --reject must reject nothing. Building the pattern from
// `undefined` would yield an empty regexp that matches every URL, which
// would stop the crawl from following any link at all.
var rejectRegexp = args.options.reject ? new RegExp(args.options.reject) : null;

// NOTE(review): no explicit start() call is made here; the `crawl` shorthand
// is presumed to start the crawler on its own — confirm against the installed
// simplecrawler version.
var crawler = require('simplecrawler').crawl(rootURL);
crawler.maxDepth = 3;

// Decide, for each discovered URL, whether the crawler should fetch it.
// As a side effect, URLs matching --print are written to stdout, even when
// they are subsequently rejected for fetching.
crawler.addFetchCondition(function(queueItem) {
  // Rebuild the URL from its parsed parts; uriPath is used so the printed
  // and tested form carries no query string (assumes simplecrawler's
  // queue-item fields — TODO confirm for the installed version).
  var candidate = queueItem.protocol + '://' + queueItem.host + queueItem.uriPath;

  if (printRegexp.test(candidate)) {
    console.log(candidate);
  }

  // Never wander outside the tree rooted at the starting URL.
  var outsideRoot = candidate.indexOf(rootURL) < 0;
  var rejected = outsideRoot ||
      (rejectRegexp !== null && rejectRegexp.test(candidate));

  return !rejected;
});
|
||||
7
clients/pitchmap/package.json
Normal file
7
clients/pitchmap/package.json
Normal file
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"name": "fetch-urls",
|
||||
"dependencies": {
|
||||
"argv": "*",
|
||||
"simplecrawler": "*"
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user