Clean up usernames from import and limit to top 30

This commit is contained in:
Dan Willhite
2017-08-25 11:18:01 -07:00
committed by Dan Willhite
parent f23cbe5344
commit cc42196818
2 changed files with 31 additions and 9 deletions

View File

@@ -72,21 +72,17 @@ func runImport(dir, dsSpec string) error {
userpat := regexp.MustCompile(`^[a-zA-Z][a-zA-Z\s]*\d*$`)
fmt.Println("Creating users")
usermap := map[string]struct{}{}
usermap := map[string]int{}
outer:
for _, msg := range msgs {
name := strings.TrimSpace(msg.Author)
if !userpat.MatchString(name) {
continue outer
}
usermap[name] = struct{}{}
usermap[name] += 1
}
users := []string{}
for k, _ := range usermap {
users = append(users, k)
}
sort.Strings(users)
users := topUsers(usermap)
fmt.Println("Committing data")
root := Root{Messages: m, Index: termDocs, Users: users}
_, err = ds.Database().CommitValue(ds, marshal.MustMarshal(root))
@@ -127,3 +123,30 @@ func characterName(n *html.Node) string {
}
return strings.TrimSpace(n.FirstChild.Data)
}
// cpair pairs a character (user) name with the number of messages
// attributed to that name.
type cpair struct {
	character string
	cnt       int
}

// topUsers returns the names of the most frequent authors in usermap, which
// maps an author name to its message count.
//
// Names of length 1 or less and names starting with "ANOTHER" are dropped.
// The prefix check is case-sensitive and runs on the name as given, before
// lower-casing. Remaining names are lower-cased, and names that become equal
// after lower-casing have their counts summed so that each user appears at
// most once. At most 30 names with the highest counts are kept; ties are
// broken by name so the selection does not depend on map iteration order.
//
// The result is sorted alphabetically and is never nil.
func topUsers(usermap map[string]int) []string {
	const maxUsers = 30

	// Merge counts under the lower-cased name first; otherwise "Bob" and
	// "BOB" would compete separately and could both show up as "bob".
	merged := make(map[string]int, len(usermap))
	for name, cnt := range usermap {
		if len(name) <= 1 || strings.HasPrefix(name, "ANOTHER") {
			continue
		}
		merged[strings.ToLower(name)] += cnt
	}

	pairs := make([]cpair, 0, len(merged))
	for name, cnt := range merged {
		pairs = append(pairs, cpair{character: name, cnt: cnt})
	}

	// Sort descending by cnt. sort.Slice is not stable and the pairs come
	// from a map, so equal counts need an explicit tie-break to make the
	// cutoff below deterministic.
	sort.Slice(pairs, func(i, j int) bool {
		if pairs[i].cnt != pairs[j].cnt {
			return pairs[i].cnt > pairs[j].cnt
		}
		return pairs[i].character < pairs[j].character
	})
	if len(pairs) > maxUsers {
		pairs = pairs[:maxUsers]
	}

	users := make([]string, 0, len(pairs))
	for _, p := range pairs {
		users = append(users, p.character)
	}
	sort.Strings(users)
	return users
}

View File

@@ -8,8 +8,6 @@ import (
"context"
"encoding/base64"
floodsub "gx/ipfs/QmZdsQf8BiCpAj61nz9NgqVeRUkw9vATvCs7UHFTxoUMDb/floodsub"
"github.com/attic-labs/noms/go/d"
"github.com/attic-labs/noms/go/datas"
"github.com/attic-labs/noms/go/hash"
@@ -17,6 +15,7 @@ import (
"github.com/attic-labs/noms/go/merge"
"github.com/attic-labs/noms/go/types"
"github.com/attic-labs/noms/samples/go/ipfs-chat/dbg"
"gx/ipfs/QmZdsQf8BiCpAj61nz9NgqVeRUkw9vATvCs7UHFTxoUMDb/floodsub"
)
func Replicate(sub *floodsub.Subscription, source, dest datas.Dataset, didChange func(ds datas.Dataset)) {