mirror of
https://github.com/dolthub/dolt.git
synced 2026-05-24 02:43:42 -05:00
JS MLB pitch-indexer (#1680)
This is a re-implementation of our old 'pitchmap' indexer. It takes MLB game data that was previously imported from XML files available on mlb.com and processes it to extract per-pitcher pitch data. It feels slow, but it is a starting point that can be improved later for fun.
This commit is contained in:
Symlink
+1
@@ -0,0 +1 @@
|
||||
../../../js/.babelrc
|
||||
@@ -0,0 +1,3 @@
|
||||
// Start from the shared @attic ESLint configuration.
const config = require('@attic/eslintrc');

// Allow console
config.rules['no-console'] = 0;

module.exports = config;
|
||||
+1
@@ -0,0 +1 @@
|
||||
../../../js/.flowconfig
|
||||
@@ -0,0 +1,2 @@
|
||||
node_modules
|
||||
dist
|
||||
@@ -0,0 +1,38 @@
|
||||
{
|
||||
"name": "@attic/pitch-index",
|
||||
"main": "dist/main.js",
|
||||
"version": "1.0.1",
|
||||
"description": "noms example javascript thing",
|
||||
"dependencies": {
|
||||
"@attic/noms": "file:../../../js",
|
||||
"babel-regenerator-runtime": "6.5.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@attic/eslintrc": "^1.0.0",
|
||||
"babel-cli": "6.6.5",
|
||||
"babel-core": "6.7.2",
|
||||
"babel-generator": "6.7.2",
|
||||
"babel-plugin-syntax-async-functions": "6.5.0",
|
||||
"babel-plugin-syntax-flow": "6.5.0",
|
||||
"babel-plugin-transform-async-to-generator": "6.7.0",
|
||||
"babel-plugin-transform-class-properties": "6.6.0",
|
||||
"babel-plugin-transform-es2015-destructuring": "6.6.5",
|
||||
"babel-plugin-transform-es2015-modules-commonjs": "6.7.0",
|
||||
"babel-plugin-transform-es2015-parameters": "6.7.0",
|
||||
"babel-plugin-transform-runtime": "^6.6.0",
|
||||
"babel-preset-es2015": "6.6.0",
|
||||
"babel-preset-react": "6.5.0",
|
||||
"flow-bin": "^0.25.0",
|
||||
"chai": "3.5.0",
|
||||
"mocha": "2.4.5"
|
||||
},
|
||||
"scripts": {
|
||||
"start": "babel -d dist -w src",
|
||||
"build": "BABEL_ENV=production babel -d dist src",
|
||||
"test": "eslint src/ && flow src/"
|
||||
},
|
||||
"keywords": [
|
||||
"noms",
|
||||
"example"
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,186 @@
|
||||
// Copyright 2016 The Noms Authors. All rights reserved.
|
||||
// Licensed under the Apache License, version 2.0:
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
// @flow
|
||||
|
||||
import argv from 'yargs';
|
||||
import {
|
||||
DatasetSpec,
|
||||
invariant,
|
||||
List,
|
||||
Map as NomsMap,
|
||||
Ref,
|
||||
Struct,
|
||||
newStruct,
|
||||
} from '@attic/noms';
|
||||
import type {Value} from '@attic/noms';
|
||||
|
||||
// Command-line parsing: two positional arguments are demanded, the dataset to
// read from followed by the dataset to write to.
const usageText = 'Usage: $0 <input-dataset> <output-dataset>';
const missingArgsText = 'You must provide both a dataset to read from, and one to write to.';
const args = argv.usage(usageText).demand(2, missingArgsText).argv;
|
||||
|
||||
// Run the indexer. Any rejection is reported on stderr and turned into a
// non-zero exit status.
main().catch(ex => {
  // Rejections are not guaranteed to be Error objects; fall back to the value
  // itself so the failure reason is never printed as `undefined`.
  console.error(ex && ex.stack ? ex.stack : ex);
  process.exit(1);
});
|
||||
|
||||
// An entry read from the input dataset: a Noms map from string keys to nested
// string-keyed Noms maps.
// NOTE(review): presumably mirrors the element structure of the imported XML -
// confirm against the importer.
type XMLElement = NomsMap<string, NomsMap<string, any>>;
|
||||
|
||||
async function main(): Promise<void> {
|
||||
const inSpec = DatasetSpec.parse(args._[0]);
|
||||
invariant(inSpec, quit('invalid input dataset spec'));
|
||||
const outSpec = DatasetSpec.parse(args._[1]);
|
||||
invariant(outSpec, quit('invalid input dataset spec'));
|
||||
|
||||
const input = inSpec.dataset();
|
||||
const commit = await input.head();
|
||||
const head = commit && commit.value;
|
||||
invariant(head, quit(`{args._[0]} does not exist}`));
|
||||
|
||||
const pitchers = new Map();
|
||||
const inningPs = [];
|
||||
const playerPs = [];
|
||||
await head.forEach((ref: Ref<XMLElement>) => {
|
||||
// We force elemP to be 'any' here because the 'inning' entry and the 'Player' entry have
|
||||
// different types that involve multiple levels of nested maps OR strings.
|
||||
const elemP: any = ref.targetValue(input.database);
|
||||
inningPs.push(maybeProcessInning(elemP));
|
||||
playerPs.push(maybeProcessPitcher(elemP, pitchers));
|
||||
});
|
||||
|
||||
await Promise.all(playerPs);
|
||||
const pitcherPitches = new Map();
|
||||
for (const m of await Promise.all(inningPs)) {
|
||||
if (m) {
|
||||
for (const [pitcherID, pitches] of m) {
|
||||
const pitcher = pitchers.get(pitcherID);
|
||||
pitcherPitches.set(pitcher, extendArray(pitches, pitcherPitches.get(pitcher)));
|
||||
}
|
||||
}
|
||||
}
|
||||
const mapData = [];
|
||||
for (const [pitcher, pitches] of pitcherPitches) {
|
||||
mapData.push([pitcher, new List(pitches)]);
|
||||
}
|
||||
await outSpec.dataset().commit(new NomsMap(mapData));
|
||||
}
|
||||
|
||||
async function maybeProcessPitcher(ep: Promise<XMLEntity>, pitchers: Map<string, string>):
|
||||
Promise<void> {
|
||||
const player = await (await ep).get('Player');
|
||||
if (player) {
|
||||
const [id, first, last] = await Promise.all([
|
||||
player.get('-id'), player.get('-first_name'), player.get('-last_name')]);
|
||||
pitchers.set(id, last + ', ' + first);
|
||||
}
|
||||
}
|
||||
|
||||
// In-memory (JS Map) accumulator from a string identifying a pitcher to the
// array of pitches, each a Noms Struct, attributed to that pitcher.
type PitcherPitches = Map<string, Array<Struct>>;
|
||||
|
||||
function mergeInto(a: PitcherPitches, b: ?PitcherPitches) {
|
||||
if (!b) {
|
||||
return a;
|
||||
}
|
||||
for (const [pitcher, pitches] of b) {
|
||||
a.set(pitcher, extendArray(pitches, a.get(pitcher)));
|
||||
}
|
||||
}
|
||||
|
||||
function maybeProcessInning(ep: Promise<XMLElement>): Promise<?Map<string, Array<Struct>>> {
|
||||
return ep.then(elem => elem.get('inning')).then(inn => inn && processInning(inn));
|
||||
}
|
||||
|
||||
function processInning(inning: NomsMap<string, NomsMap>): Promise<Map<string, Array<Struct>>> {
|
||||
return Promise.all([inning.get('top'), inning.get('bottom')])
|
||||
.then(halves => {
|
||||
const halfPs = [];
|
||||
for (const half of halves) {
|
||||
if (half) {
|
||||
halfPs.push(half.get('atbat'));
|
||||
}
|
||||
}
|
||||
return Promise.all(halfPs);
|
||||
})
|
||||
.then(abData => {
|
||||
const abPs = [];
|
||||
for (const abs of abData) {
|
||||
abPs.push(processAbs(normalize(abs)));
|
||||
}
|
||||
return Promise.all(abPs);
|
||||
})
|
||||
.then(pitcherPitchList => {
|
||||
const ret = new Map();
|
||||
for (const pitcherPitches of pitcherPitchList) {
|
||||
mergeInto(ret, pitcherPitches);
|
||||
}
|
||||
return ret;
|
||||
});
|
||||
}
|
||||
|
||||
function processAbs(abs: List): Promise<PitcherPitches> {
|
||||
const ps = [];
|
||||
return abs.forEach(ab => {
|
||||
ps.push(
|
||||
Promise.all([ab.get('-pitcher'), ab.get('pitch')])
|
||||
.then(([pitcher, d]) => Promise.all([pitcher, processPitches(normalize(d))]))
|
||||
);
|
||||
})
|
||||
.then(() => Promise.all(ps))
|
||||
.then(abdata => {
|
||||
const pitchCounts = new Map();
|
||||
for (const [pitcher, pitches] of abdata) {
|
||||
if (pitches.length > 0) {
|
||||
pitchCounts.set(pitcher, extendArray(pitchCounts.get(pitcher), pitches));
|
||||
}
|
||||
}
|
||||
return pitchCounts;
|
||||
});
|
||||
}
|
||||
|
||||
function extendArray<T>(a: Array<T> = [], b: Array<T> = []): Array<T> {
|
||||
b.forEach(e => a.push(e));
|
||||
return a;
|
||||
}
|
||||
|
||||
function normalize<T: Value>(d: ?T | List<T>): List<T> {
|
||||
if (!d) {
|
||||
return new List();
|
||||
}
|
||||
if (d instanceof List) {
|
||||
return d;
|
||||
}
|
||||
return new List([d]);
|
||||
}
|
||||
|
||||
// A single pitch record: a Noms map from string attribute names (such as
// '-px' and '-pz') to string values.
type PitchData = NomsMap<string, string>;
|
||||
|
||||
function processPitches(d: List<PitchData>): Promise<Array<Struct>> {
|
||||
const pitchPs = [];
|
||||
return d.forEach((p: PitchData) => {
|
||||
pitchPs.push(getPitch(p));
|
||||
})
|
||||
.then(() => pitchPs)
|
||||
.then(pitchPs => Promise.all(pitchPs))
|
||||
.then(pitches => pitches.filter((e: ?Struct): boolean => !!e));
|
||||
}
|
||||
|
||||
function getPitch(p: PitchData): Promise<?Struct> {
|
||||
return Promise.all([p.get('-px'), p.get('-pz')]).then(([xStr, zStr]) => {
|
||||
if (!xStr || !zStr) {
|
||||
return;
|
||||
}
|
||||
const [x, z] = [Number(xStr), Number(zStr)];
|
||||
invariant(!isNaN(x), x + ' should be a number');
|
||||
invariant(!isNaN(z), z + ' should be a number');
|
||||
return newStruct('Pitch', {x, z});
|
||||
});
|
||||
}
|
||||
|
||||
function quit(err: string): () => void {
|
||||
return () => {
|
||||
process.stderr.write(err + '\n');
|
||||
process.exit(1);
|
||||
};
|
||||
}
|
||||
Reference in New Issue
Block a user