JS MLB pitch-indexer (#1680)

This is a re-implementation of our old 'pitchmap' indexer. It takes in
MLB game data that was previously imported from XML files available on
mlb.com and processes it to extract per-pitcher pitch data.

It feels slow, but it's a starting point that we can improve on later for fun.
This commit is contained in:
cmasone-attic
2016-05-31 21:06:45 -07:00
parent 3429d10b70
commit 7887422c36
6 changed files with 231 additions and 0 deletions
+1
View File
@@ -0,0 +1 @@
../../../js/.babelrc
+3
View File
@@ -0,0 +1,3 @@
// Start from the shared Attic ESLint configuration.
const config = require('@attic/eslintrc');

// Allow console: switch the 'no-console' rule off for this package.
config.rules['no-console'] = 0;

module.exports = config;
+1
View File
@@ -0,0 +1 @@
../../../js/.flowconfig
+2
View File
@@ -0,0 +1,2 @@
node_modules
dist
+38
View File
@@ -0,0 +1,38 @@
{
"name": "@attic/pitch-index",
"main": "dist/main.js",
"version": "1.0.1",
"description": "noms example javascript thing",
"dependencies": {
"@attic/noms": "file:../../../js",
"babel-regenerator-runtime": "6.5.0"
},
"devDependencies": {
"@attic/eslintrc": "^1.0.0",
"babel-cli": "6.6.5",
"babel-core": "6.7.2",
"babel-generator": "6.7.2",
"babel-plugin-syntax-async-functions": "6.5.0",
"babel-plugin-syntax-flow": "6.5.0",
"babel-plugin-transform-async-to-generator": "6.7.0",
"babel-plugin-transform-class-properties": "6.6.0",
"babel-plugin-transform-es2015-destructuring": "6.6.5",
"babel-plugin-transform-es2015-modules-commonjs": "6.7.0",
"babel-plugin-transform-es2015-parameters": "6.7.0",
"babel-plugin-transform-runtime": "^6.6.0",
"babel-preset-es2015": "6.6.0",
"babel-preset-react": "6.5.0",
"flow-bin": "^0.25.0",
"chai": "3.5.0",
"mocha": "2.4.5"
},
"scripts": {
"start": "babel -d dist -w src",
"build": "BABEL_ENV=production babel -d dist src",
"test": "eslint src/ && flow src/"
},
"keywords": [
"noms",
"example"
]
}
+186
View File
@@ -0,0 +1,186 @@
// Copyright 2016 The Noms Authors. All rights reserved.
// Licensed under the Apache License, version 2.0:
// http://www.apache.org/licenses/LICENSE-2.0
// @flow
import argv from 'yargs';
import {
DatasetSpec,
invariant,
List,
Map as NomsMap,
Ref,
Struct,
newStruct,
} from '@attic/noms';
import type {Value} from '@attic/noms';
// Command-line handling: two positional arguments are demanded, the dataset to read from
// followed by the dataset to write to.
const parser = argv
  .usage('Usage: $0 <input-dataset> <output-dataset>')
  .demand(2, 'You must provide both a dataset to read from, and one to write to.');
const args = parser.argv;

// Run the indexer. Any rejection that escapes main() is reported with its stack trace and the
// process exits with status 1.
main().catch(err => {
  console.error(err.stack);
  process.exit(1);
});
type XMLElement = NomsMap<string, NomsMap<string, any>>;
async function main(): Promise<void> {
const inSpec = DatasetSpec.parse(args._[0]);
invariant(inSpec, quit('invalid input dataset spec'));
const outSpec = DatasetSpec.parse(args._[1]);
invariant(outSpec, quit('invalid input dataset spec'));
const input = inSpec.dataset();
const commit = await input.head();
const head = commit && commit.value;
invariant(head, quit(`{args._[0]} does not exist}`));
const pitchers = new Map();
const inningPs = [];
const playerPs = [];
await head.forEach((ref: Ref<XMLElement>) => {
// We force elemP to be 'any' here because the 'inning' entry and the 'Player' entry have
// different types that involve multiple levels of nested maps OR strings.
const elemP: any = ref.targetValue(input.database);
inningPs.push(maybeProcessInning(elemP));
playerPs.push(maybeProcessPitcher(elemP, pitchers));
});
await Promise.all(playerPs);
const pitcherPitches = new Map();
for (const m of await Promise.all(inningPs)) {
if (m) {
for (const [pitcherID, pitches] of m) {
const pitcher = pitchers.get(pitcherID);
pitcherPitches.set(pitcher, extendArray(pitches, pitcherPitches.get(pitcher)));
}
}
}
const mapData = [];
for (const [pitcher, pitches] of pitcherPitches) {
mapData.push([pitcher, new List(pitches)]);
}
await outSpec.dataset().commit(new NomsMap(mapData));
}
async function maybeProcessPitcher(ep: Promise<XMLEntity>, pitchers: Map<string, string>):
Promise<void> {
const player = await (await ep).get('Player');
if (player) {
const [id, first, last] = await Promise.all([
player.get('-id'), player.get('-first_name'), player.get('-last_name')]);
pitchers.set(id, last + ', ' + first);
}
}
type PitcherPitches = Map<string, Array<Struct>>;
function mergeInto(a: PitcherPitches, b: ?PitcherPitches) {
if (!b) {
return a;
}
for (const [pitcher, pitches] of b) {
a.set(pitcher, extendArray(pitches, a.get(pitcher)));
}
}
function maybeProcessInning(ep: Promise<XMLElement>): Promise<?Map<string, Array<Struct>>> {
return ep.then(elem => elem.get('inning')).then(inn => inn && processInning(inn));
}
function processInning(inning: NomsMap<string, NomsMap>): Promise<Map<string, Array<Struct>>> {
return Promise.all([inning.get('top'), inning.get('bottom')])
.then(halves => {
const halfPs = [];
for (const half of halves) {
if (half) {
halfPs.push(half.get('atbat'));
}
}
return Promise.all(halfPs);
})
.then(abData => {
const abPs = [];
for (const abs of abData) {
abPs.push(processAbs(normalize(abs)));
}
return Promise.all(abPs);
})
.then(pitcherPitchList => {
const ret = new Map();
for (const pitcherPitches of pitcherPitchList) {
mergeInto(ret, pitcherPitches);
}
return ret;
});
}
function processAbs(abs: List): Promise<PitcherPitches> {
const ps = [];
return abs.forEach(ab => {
ps.push(
Promise.all([ab.get('-pitcher'), ab.get('pitch')])
.then(([pitcher, d]) => Promise.all([pitcher, processPitches(normalize(d))]))
);
})
.then(() => Promise.all(ps))
.then(abdata => {
const pitchCounts = new Map();
for (const [pitcher, pitches] of abdata) {
if (pitches.length > 0) {
pitchCounts.set(pitcher, extendArray(pitchCounts.get(pitcher), pitches));
}
}
return pitchCounts;
});
}
function extendArray<T>(a: Array<T> = [], b: Array<T> = []): Array<T> {
b.forEach(e => a.push(e));
return a;
}
function normalize<T: Value>(d: ?T | List<T>): List<T> {
if (!d) {
return new List();
}
if (d instanceof List) {
return d;
}
return new List([d]);
}
type PitchData = NomsMap<string, string>;
function processPitches(d: List<PitchData>): Promise<Array<Struct>> {
const pitchPs = [];
return d.forEach((p: PitchData) => {
pitchPs.push(getPitch(p));
})
.then(() => pitchPs)
.then(pitchPs => Promise.all(pitchPs))
.then(pitches => pitches.filter((e: ?Struct): boolean => !!e));
}
function getPitch(p: PitchData): Promise<?Struct> {
return Promise.all([p.get('-px'), p.get('-pz')]).then(([xStr, zStr]) => {
if (!xStr || !zStr) {
return;
}
const [x, z] = [Number(xStr), Number(zStr)];
invariant(!isNaN(x), x + ' should be a number');
invariant(!isNaN(z), z + ' should be a number');
return newStruct('Pitch', {x, z});
});
}
function quit(err: string): () => void {
return () => {
process.stderr.write(err + '\n');
process.exit(1);
};
}