Files
timeliner/itemfiles.go
T
Matthew Holt 0ca48fecfd Initial commit
2019-01-19 20:28:53 -07:00

284 lines
8.7 KiB
Go

package timeliner
import (
"crypto/sha256"
"database/sql"
"encoding/base64"
"fmt"
"hash"
"io"
"log"
mathrand "math/rand"
"os"
"path"
"path/filepath"
"regexp"
"strings"
"time"
)
// downloadItemFile streams the contents of src into dest while feeding
// the same bytes to h, so that after a successful return h has been
// written exactly the bytes that were written to dest. dest is flushed
// with Sync before returning.
//
// An error is returned without writing anything if src, dest, or h is
// nil. If copying or syncing fails, the partially-written file is removed
// (best effort) and the error is returned. This function does not close
// src or dest.
func (t *Timeline) downloadItemFile(src io.ReadCloser, dest *os.File, h hash.Hash) error {
	if src == nil {
		return fmt.Errorf("missing reader with which to download file")
	}
	if dest == nil {
		return fmt.Errorf("missing file to download into")
	}
	if h == nil {
		// io.TeeReader would panic on the first read with a nil writer
		return fmt.Errorf("missing hasher with which to checksum file")
	}

	// TODO: What if file already exists on disk (byte-for-byte)? - i.e. data_hash in DB has a duplicate

	// give the hasher a copy of the file bytes
	tr := io.TeeReader(src, h)

	if _, err := io.Copy(dest, tr); err != nil {
		// best-effort cleanup of the partial file; the copy error is what matters
		os.Remove(dest.Name())
		return fmt.Errorf("copying contents: %v", err)
	}
	if err := dest.Sync(); err != nil {
		// best-effort cleanup; the sync error is what matters
		os.Remove(dest.Name())
		return fmt.Errorf("syncing file: %v", err)
	}

	// TODO: If mime type is photo or video, extract most important EXIF data and return it for storage in DB?

	return nil
}
// openUniqueCanonicalItemDataFile creates and opens a new data file for
// the item and returns the open file along with a pointer to its
// canonical (slash-separated) path. The directory for the file is created
// if needed. The first name tried is the item's plain canonical file
// name; whenever a name is already taken on disk, a numeric suffix
// ("_1", "_2", ...) is placed before the extension and the next name is
// tried, up to 100 attempts. The file is opened with O_EXCL, so an
// existing file is never overwritten.
//
// An error is returned if dataSourceID is empty, if the directory or
// file cannot be created, or if no free name was found.
func (t *Timeline) openUniqueCanonicalItemDataFile(it Item, dataSourceID string) (*os.File, *string, error) {
	if dataSourceID == "" {
		return nil, nil, fmt.Errorf("missing service ID")
	}

	dir := t.canonicalItemDataFileDir(it, dataSourceID)
	if mkErr := os.MkdirAll(t.fullpath(dir), 0700); mkErr != nil {
		return nil, nil, fmt.Errorf("making directory for data file: %v", mkErr)
	}

	const maxAttempts = 100

	candidate := path.Join(dir, t.canonicalItemDataFileName(it, dataSourceID))

	// suffix is whatever currently trails the stem of candidate: the bare
	// extension at first, then "_N<ext>" once a numbered name has been tried
	suffix := path.Ext(candidate)

	for attempt := 1; attempt <= maxAttempts; attempt++ {
		diskPath := t.fullpath(filepath.FromSlash(candidate))
		file, openErr := os.OpenFile(diskPath, os.O_CREATE|os.O_RDWR|os.O_EXCL, 0600)
		switch {
		case openErr == nil:
			return file, &candidate, nil
		case !os.IsExist(openErr):
			return nil, nil, fmt.Errorf("creating data file: %v", openErr)
		}

		// name is taken: swap the previous suffix for the next numbered one
		ext := path.Ext(candidate)
		stem := strings.TrimSuffix(candidate, suffix)
		suffix = fmt.Sprintf("_%d%s", attempt, ext)
		candidate = stem + suffix
	}

	return nil, nil, fmt.Errorf("unable to find available filename for item: %s", candidate)
}
// canonicalItemDataFileName returns the plain, canonical file name (a
// single path component, no directory) for the item's data file. It does
// no uniqueness checks. The name is chosen from the first of these that
// yields a non-empty result:
//
//  1. the item's own data file name, sanitized with safePathComponent;
//  2. "item_" followed by the item's original ID, sanitized the same way;
//  3. the item's timestamp, if non-zero, formatted as 2006_01_02_150405;
//  4. a random 24-character string, since nothing deterministic is left.
//
// The result is then shortened if it is too long. The dataSourceID
// parameter is currently unused.
func (t *Timeline) canonicalItemDataFileName(it Item, dataSourceID string) string {
	// ideally, the filename is simply the one provided with the item
	var filename string
	if fname := it.DataFileName(); fname != nil {
		filename = t.safePathComponent(*fname)
	}

	// otherwise, try a filename based on the item's ID; the ID comes from
	// the data source, so it must be sanitized just like a file name or
	// a "/" or ".." in it could escape the data directory
	if filename == "" {
		if safeID := t.safePathComponent(it.ID()); safeID != "" {
			filename = fmt.Sprintf("item_%s", safeID)
		}
	}

	// otherwise, try a filename based on the item's timestamp
	if filename == "" {
		if ts := it.Timestamp(); !ts.IsZero() {
			filename = ts.Format("2006_01_02_150405")
		}
	}

	// otherwise, out of options; revert to a random string
	// since no deterministic filename is available
	if filename == "" {
		filename = randomString(24, false)
	}

	// shorten the name if needed (thanks for everything, Windows)
	return t.ensureDataFileNameShortEnough(filename)
}
// canonicalItemDataFileDir returns the slash-separated directory in which
// the item's data file belongs, in the form
// data/<4-digit year>/<2-digit month>/<sanitized data source ID>.
// The year and month come from the item's timestamp, or from the current
// time if the timestamp is zero. An empty dataSourceID is replaced with
// "unknown_service". Callers must convert the separators for the OS
// before touching the disk.
func (t *Timeline) canonicalItemDataFileDir(it Item, dataSourceID string) string {
	when := it.Timestamp()
	if when.IsZero() {
		when = time.Now()
	}

	source := dataSourceID
	if source == "" {
		source = "unknown_service"
	}

	yearDir := fmt.Sprintf("%04d", when.Year())
	monthDir := fmt.Sprintf("%02d", when.Month())

	// built with "/" separators on purpose; see doc comment
	return path.Join("data", yearDir, monthDir, t.safePathComponent(source))
}
// ensureDataFileNameShortEnough returns filename unchanged if it is at
// most 250 bytes long. Otherwise it returns a 250-byte name made of the
// start of filename followed by its extension, where the extension
// itself is first cut to at most 20 bytes.
func (t *Timeline) ensureDataFileNameShortEnough(filename string) string {
	const (
		maxNameLen = 250 // thanks for nothing, Windows
		maxExtLen  = 20  // arbitrary and unlikely, but just in case
	)

	if len(filename) <= maxNameLen {
		return filename
	}

	suffix := path.Ext(filename)
	if len(suffix) > maxExtLen {
		suffix = suffix[:maxExtLen]
	}

	// keep as much of the front of the name as fits before the extension
	return filename[:maxNameLen-len(suffix)] + suffix
}
// func ensureDataFileNameUnique(canonicalDataFileName string, maxTries int) (string, error) {
// if maxTries < 1 {
// panic("maxTries must be at least 1")
// }
// lastAppend := path.Ext(canonicalDataFileName)
// for i := 0; i < maxTries; i++ {
// if !datafileExists(canonicalDataFileName) {
// return canonicalDataFileName, nil
// }
// ext := path.Ext(canonicalDataFileName)
// canonicalDataFileName = strings.TrimSuffix(canonicalDataFileName, lastAppend)
// lastAppend = fmt.Sprintf("_%d%s", i+2, ext) // start at 1, but actually 2 because first file is "1"
// canonicalDataFileName += lastAppend
// }
// return "", fmt.Errorf("could not find an available filename for %s in %d iterations",
// canonicalDataFileName, maxTries)
// }
// replaceWithExisting de-duplicates a newly-downloaded data file. It
// looks in the DB for another item (row ID other than itemRowID) whose
// data_hash equals checksumBase64. If there is none, it does nothing. If
// there is one, the file that item refers to is re-hashed with SHA-256:
//
//   - if its hash still matches, the newly-downloaded file at *canonical
//     is deleted;
//   - if it no longer matches (it was modified), the newly-downloaded
//     file is moved over it to restore the expected contents.
//
// In both cases *canonical is updated to the existing file's name so the
// caller can record the shared file for this item.
//
// canonical must be non-nil and point to a non-empty canonical data file
// name, and checksumBase64 must be non-empty; otherwise an error is
// returned. Errors are also returned for DB or file system failures, and
// when the matching row has no data file name.
//
// TODO/NOTE: If changing a file name, all items with same data_hash must also be updated to use same file name
func (t *Timeline) replaceWithExisting(canonical *string, checksumBase64 string, itemRowID int64) error {
	if canonical == nil || *canonical == "" || checksumBase64 == "" {
		return fmt.Errorf("missing data filename and/or hash of contents")
	}

	var existingDatafile *string
	err := t.db.QueryRow(`SELECT data_file FROM items
		WHERE data_hash = ? AND id != ? LIMIT 1`,
		checksumBase64, itemRowID).Scan(&existingDatafile)
	if err == sql.ErrNoRows {
		return nil // file is unique; carry on
	}
	if err != nil {
		return fmt.Errorf("querying DB: %v", err)
	}

	// file is a duplicate!

	if existingDatafile == nil {
		// ... that's weird, how's this possible? it has a hash but no file name recorded
		return fmt.Errorf("item with matching hash is missing data file name; hash: %s", checksumBase64)
	}

	if *existingDatafile == *canonical {
		// the other item already refers to this very file; removing the
		// "duplicate" below would delete the only copy
		return nil
	}

	newPath := t.fullpath(*canonical)
	existingPath := t.fullpath(*existingDatafile)

	// ensure the existing file is still the same
	f, err := os.Open(existingPath)
	if err != nil {
		return fmt.Errorf("opening existing file: %v", err)
	}
	h := sha256.New()
	_, err = io.Copy(h, f)
	// close right away (not deferred) so the file is not held open
	// during a possible rename over it below; it was only read from,
	// so a close error is not actionable
	f.Close()
	if err != nil {
		return fmt.Errorf("checking file integrity: %v", err)
	}
	b64ExistingFileHash := base64.StdEncoding.EncodeToString(h.Sum(nil))

	if checksumBase64 != b64ExistingFileHash {
		// the existing file was modified; restore it with what we
		// just downloaded, which presumably succeeded
		log.Printf("[INFO] Restoring modified data file: %s was '%s' but is now '%s'",
			*existingDatafile, checksumBase64, b64ExistingFileHash)
		// the rename moves the new file away from newPath, so there
		// is nothing left there to remove afterward
		if err := os.Rename(newPath, existingPath); err != nil {
			return fmt.Errorf("replacing modified data file: %v", err)
		}
	} else {
		// everything checks out; delete the newly-downloaded file
		// and use the existing file instead of duplicating it
		if err := os.Remove(newPath); err != nil {
			return fmt.Errorf("removing duplicate data file: %v", err)
		}
	}

	// write through the pointer so the caller sees the new name
	*canonical = *existingDatafile

	return nil
}
// randomString returns a string of n characters drawn from math/rand.
// It is neither secure nor uniformly distributed, but it is good enough
// for things like fallback file names. An empty string is returned when
// n is zero or negative.
//
// By default characters come from a mixed-case alphabet that leaves out
// easily-confused characters (I, l, 1, 0, O, and a few others). If
// sameCase is true, only lowercase letters and digits are used.
func randomString(n int, sameCase bool) string {
	if n <= 0 {
		return ""
	}

	alphabet := "abcdefghijkmnopqrstuvwxyzABCDEFGHJKLMNPQRTUVWXY23456789"
	if sameCase {
		alphabet = "abcdefghijkmnpqrstuvwxyz0123456789"
	}
	size := int64(len(alphabet))

	out := make([]byte, 0, n)
	for len(out) < n {
		out = append(out, alphabet[mathrand.Int63()%size])
	}
	return string(out)
}
// fullpath converts a canonical (slash-separated) data file name into an
// OS-native path by swapping in the OS path separator and joining the
// result onto the timeline's repo directory.
func (t *Timeline) fullpath(canonicalDatafileName string) string {
	native := filepath.FromSlash(canonicalDatafileName)
	return filepath.Join(t.repoDir, native)
}
// datafileExists reports whether the canonical data file name refers to
// something on disk. It returns false only when Stat reports that the
// path does not exist; any other Stat outcome, including other errors,
// counts as existing.
func (t *Timeline) datafileExists(canonicalDatafileName string) bool {
	if _, statErr := os.Stat(t.fullpath(canonicalDatafileName)); os.IsNotExist(statErr) {
		return false
	}
	return true
}
// safePathComponent sanitizes s for use as a single path component. Every
// character matched by safePathRE is dropped, then every occurrence of
// ".." is removed. If all that remains is ".", the empty string is
// returned instead. The result may be empty.
func (t *Timeline) safePathComponent(s string) string {
	stripped := safePathRE.ReplaceAllLiteralString(s, "")
	cleaned := strings.Replace(stripped, "..", "", -1)
	if cleaned == "." {
		return ""
	}
	return cleaned
}
// safePathRE matches any undesirable character in a file path component:
// anything other than a word character (\w), a dot, or a hyphen.
// Note that this allows dots, so you'll have to strip ".." manually.
var safePathRE = regexp.MustCompile(`[^\w.-]`)