From 0c7051c937022e2fce22604c86ea286d8203f3f9 Mon Sep 17 00:00:00 2001
From: sblinch
Date: Wed, 13 Mar 2019 17:00:24 -0700
Subject: [PATCH] Added support for saving the index to a simple JSON file after each indexing job, and loading it (followed by a quick refresh) at the next startup

---
 bookbrowser.go       |   3 +
 indexer/indexer.go   | 166 ++++++++++++++++++++++++++++++++++++++----
 indexer/seencache.go |  69 ++++++++++++++++++
 server/server.go     |  16 ++++
 4 files changed, 243 insertions(+), 11 deletions(-)
 create mode 100644 indexer/seencache.go

diff --git a/bookbrowser.go b/bookbrowser.go
index 6fc7dba6..249e4cd3 100644
--- a/bookbrowser.go
+++ b/bookbrowser.go
@@ -117,6 +117,9 @@ func main() {
 
 	s := server.NewServer(*addr, *bookdir, *tempdir, curversion, true, *nocovers)
 	go func() {
+		if err := s.LoadBookIndex(); err != nil {
+			log.Printf("Warning: could not load index cache: %v", err)
+		}
 		s.RefreshBookIndex()
 		if len(s.Indexer.BookList()) == 0 {
 			log.Fatalln("Fatal error: no books found")
diff --git a/indexer/indexer.go b/indexer/indexer.go
index 3fbbd82a..53c3cccb 100644
--- a/indexer/indexer.go
+++ b/indexer/indexer.go
@@ -11,9 +11,10 @@ import (
 
 	"github.com/geek1011/BookBrowser/booklist"
 	"github.com/geek1011/BookBrowser/formats"
-	zglob "github.com/mattn/go-zglob"
+	"github.com/mattn/go-zglob"
 	"github.com/nfnt/resize"
 	"github.com/pkg/errors"
+	"encoding/json"
 )
 
 type Indexer struct {
@@ -25,6 +26,7 @@ type Indexer struct {
 	booklist  booklist.BookList
 	mu        sync.Mutex
 	indMu     sync.Mutex
+	seen      *SeenCache
 }
 
 func New(paths []string, coverpath *string, exts []string) (*Indexer, error) {
@@ -45,7 +47,109 @@ func New(paths []string, coverpath *string, exts []string) (*Indexer, error) {
 		cp = &p
 	}
 
-	return &Indexer{paths: paths, coverpath: cp, exts: exts}, nil
+	return &Indexer{paths: paths, coverpath: cp, exts: exts, seen: NewSeenCache()}, nil
+}
+
+// Names of the persisted index and of the temporary file it is staged in. Both
+// live in the cover directory.
+const (
+	indexCacheName    = "index.json"
+	indexCacheTmpName = ".index.json.tmp"
+)
+
+// Load restores the book list written by a previous Save and rebuilds the
+// SeenCache from it, so that the next Refresh only has to index files that are
+// new or have changed. A missing cache file is not an error. Nothing is loaded
+// when the indexer has no cover directory, because that is where the cache is
+// kept.
+func (i *Indexer) Load() error {
+	i.indMu.Lock()
+	defer i.indMu.Unlock()
+
+	if i.coverpath == nil {
+		return nil
+	}
+
+	f, err := os.Open(filepath.Join(*i.coverpath, indexCacheName))
+	if err != nil {
+		if os.IsNotExist(err) {
+			// No cache yet: Refresh will build the index from scratch.
+			return nil
+		}
+		return errors.Wrap(err, "could not open index cache file")
+	}
+	defer f.Close()
+
+	bl := booklist.BookList{}
+	if err := json.NewDecoder(f).Decode(&bl); err != nil {
+		return errors.Wrap(err, "could not decode index cache file")
+	}
+
+	seen := NewSeenCache()
+	for index, b := range bl {
+		seen.Add(b.FilePath, b.FileSize, b.ModTime, index)
+	}
+
+	if i.Verbose {
+		log.Printf("Loaded %d items from index cache", len(bl))
+	}
+
+	i.mu.Lock()
+	i.booklist = bl
+	i.seen = seen
+	i.mu.Unlock()
+
+	return nil
+}
+
+// Save writes the current book list to the index cache in the cover directory.
+// The data is staged in a temporary file that is renamed over the cache only
+// after it has been written and closed, so a failed save never leaves a
+// truncated cache behind. It does nothing when the indexer has no cover
+// directory.
+func (i *Indexer) Save() error {
+	i.indMu.Lock()
+	defer i.indMu.Unlock()
+
+	if i.coverpath == nil {
+		return nil
+	}
+	tmpFilename := filepath.Join(*i.coverpath, indexCacheTmpName)
+	jsonFilename := filepath.Join(*i.coverpath, indexCacheName)
+
+	i.mu.Lock()
+	bl := i.booklist
+	i.mu.Unlock()
+
+	f, err := os.Create(tmpFilename)
+	if err != nil {
+		return errors.Wrap(err, "could not create index cache temporary file")
+	}
+
+	if err := json.NewEncoder(f).Encode(&bl); err != nil {
+		// Best-effort cleanup; the encoding error is the one worth reporting.
+		f.Close()
+		os.Remove(tmpFilename)
+		return errors.Wrap(err, "could not encode index cache file")
+	}
+
+	// Close before renaming: a failed close means the data may not have been
+	// written out, and some platforms cannot rename a file that is still open.
+	if err := f.Close(); err != nil {
+		os.Remove(tmpFilename)
+		return errors.Wrap(err, "could not close index cache temporary file")
+	}
+
+	if err := os.Rename(tmpFilename, jsonFilename); err != nil {
+		os.Remove(tmpFilename)
+		return errors.Wrap(err, "could not replace index cache file with temporary file")
+	}
+
+	if i.Verbose {
+		log.Printf("Saved %d items to index cache", len(bl))
+	}
+
+	return nil
 }
 
 func (i *Indexer) Refresh() ([]error, error) {
@@ -62,8 +166,21 @@ func (i *Indexer) Refresh() ([]error, error) {
 		return errs, errors.New("no paths to index")
 	}
 
-	booklist := booklist.BookList{}
-	seen := map[string]bool{}
+	// seen fingerprints each file by path, size and mtime so that unchanged
+	// files can be skipped cheaply on the next refresh. seenID records the ID
+	// of every book kept so far, which is what filters out duplicate books.
+	seenID := map[string]bool{}
+	seen := NewSeenCache()
+
+	// Snapshot the previous index. It is only read from here on: the new list
+	// is built separately, so the slice handed out earlier is never modified.
+	i.mu.Lock()
+	prev := i.booklist
+	prevSeen := i.seen
+	i.mu.Unlock()
+	if prevSeen == nil {
+		prevSeen = NewSeenCache()
+	}
 
 	filenames := []string{}
 	for _, path := range i.paths {
@@ -81,29 +198,56 @@ func (i *Indexer) Refresh() ([]error, error) {
 		}
 	}
 
+	// Build the new list from scratch in the order the files were found. Books
+	// whose files have disappeared are dropped simply by never being copied.
+	bl := make(booklist.BookList, 0, len(filenames))
+
 	for fi, filepath := range filenames {
 		if i.Verbose {
 			log.Printf("Indexing %s", filepath)
 		}
 
-		book, err := i.getBook(filepath)
+		stat, err := os.Stat(filepath)
 		if err != nil {
-			errs = append(errs, errors.Wrapf(err, "error reading book '%s'", filepath))
+			errs = append(errs, errors.Wrapf(err, "cannot stat file '%s'", filepath))
 			if i.Verbose {
 				log.Printf("--> Error: %v", errs[len(errs)-1])
 			}
 			continue
 		}
-		if !seen[book.ID()] {
-			booklist = append(booklist, book)
-			seen[book.ID()] = true
+
+		var book *booklist.Book
+		hash := seen.Hash(filepath, stat.Size(), stat.ModTime())
+		cached, prevIndex := prevSeen.SeenHash(hash)
+		if cached && prevIndex >= 0 && prevIndex < len(prev) && prev[prevIndex].FilePath == filepath {
+			// Unchanged since the last run: reuse the entry instead of reparsing.
+			book = prev[prevIndex]
+			if i.Verbose {
+				log.Printf("Already seen; not reindexing")
+			}
+		} else {
+			// TODO: pass stat variable to i.getBook() to avoid a duplicate os.Stat() for each book
+			book, err = i.getBook(filepath)
+			if err != nil {
+				errs = append(errs, errors.Wrapf(err, "error reading book '%s'", filepath))
+				if i.Verbose {
+					log.Printf("--> Error: %v", errs[len(errs)-1])
+				}
+				continue
+			}
+		}
+
+		if !seenID[book.ID()] {
+			seenID[book.ID()] = true
+			bl = append(bl, book)
+			seen.AddHash(hash, len(bl)-1)
 		}
 
 		i.Progress = float64(fi+1) / float64(len(filenames))
 	}
 
 	i.mu.Lock()
-	i.booklist = booklist
+	i.booklist = bl
+	i.seen = seen
 	i.mu.Unlock()
 
 	return errs, nil
diff --git a/indexer/seencache.go b/indexer/seencache.go
new file mode 100644
index 00000000..97a16dcc
--- /dev/null
+++ b/indexer/seencache.go
@@ -0,0 +1,69 @@
+package indexer
+
+import (
+	"crypto/sha1"
+	"fmt"
+	"time"
+)
+
+// SeenCache remembers which files have already been indexed. Each file is
+// identified by a fingerprint of its path, size and modification time, and is
+// mapped to the index of its book in the book list the cache was built for.
+type SeenCache struct {
+	seen map[string]int
+}
+
+// NewSeenCache returns an empty SeenCache.
+func NewSeenCache() *SeenCache {
+	return &SeenCache{seen: make(map[string]int)}
+}
+
+// Hash returns the fingerprint of a file with the given path, size and
+// modification time. The time is reduced to nanoseconds since the Unix epoch
+// so that the result depends only on the instant and not on how the zone of
+// modTime happens to be represented, which can change when a time is decoded
+// from JSON. The full SHA-1 digest is returned to keep collisions between
+// different files out of the picture.
+func (c *SeenCache) Hash(filePath string, fileSize int64, modTime time.Time) string {
+	token := fmt.Sprintf("%08d|%d|%s", fileSize, modTime.UnixNano(), filePath)
+	return fmt.Sprintf("%x", sha1.Sum([]byte(token)))
+}
+
+// Clear forgets every file recorded so far.
+func (c *SeenCache) Clear() {
+	c.seen = make(map[string]int)
+}
+
+// Add records the file described by filePath, fileSize and modTime as
+// belonging to the book at index, and returns the file's fingerprint.
+func (c *SeenCache) Add(filePath string, fileSize int64, modTime time.Time, index int) string {
+	hash := c.Hash(filePath, fileSize, modTime)
+	c.seen[hash] = index
+	return hash
+}
+
+// AddHash records a fingerprint previously obtained from Hash as belonging to
+// the book at index.
+func (c *SeenCache) AddHash(hash string, index int) {
+	c.seen[hash] = index
+}
+
+// Seen reports whether the file described by filePath, fileSize and modTime
+// has been recorded. If so it also returns the file's fingerprint and the
+// index stored for it; otherwise it returns "" and -1.
+func (c *SeenCache) Seen(filePath string, fileSize int64, modTime time.Time) (bool, string, int) {
+	hash := c.Hash(filePath, fileSize, modTime)
+	if index, exists := c.seen[hash]; exists {
+		return true, hash, index
+	}
+	return false, "", -1
+}
+
+// SeenHash reports whether hash has been recorded and, if so, the index
+// stored for it. Like Seen, it returns -1 as the index on a miss.
+func (c *SeenCache) SeenHash(hash string) (bool, int) {
+	if index, exists := c.seen[hash]; exists {
+		return true, index
+	}
+	return false, -1
+}
diff --git a/server/server.go b/server/server.go
index ed6447f7..1fe33045 100644
--- a/server/server.go
+++ b/server/server.go
@@ -75,6 +75,16 @@ func (s *Server) printLog(format string, v ...interface{}) {
 	}
 }
 
+// LoadBookIndex loads the book index saved by a previous run, if there is one.
+func (s *Server) LoadBookIndex() error {
+	return s.Indexer.Load()
+}
+
+// SaveBookIndex saves the current book index so the next run can load it.
+func (s *Server) SaveBookIndex() error {
+	return s.Indexer.Save()
+}
+
 // RefreshBookIndex refreshes the book index
 func (s *Server) RefreshBookIndex() error {
 	errs, err := s.Indexer.Refresh()
@@ -91,6 +101,12 @@ func (s *Server) RefreshBookIndex() error {
 	}
 
 	debug.FreeOSMemory()
+
+	if err := s.Indexer.Save(); err != nil {
+		log.Printf("Error saving index: %s", err)
+		return err
+	}
+
 	return nil
 }
 