Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 55 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,4 +74,59 @@ matches, err := model.CosN(expr, 1)
if err != nil {
log.Fatalf("error evaluating cosine similarity: %v", err)
}

// Create two expressions.
x := word2vec.Expr{"king": 1.0}
y := word2vec.Expr{"queen": 1.0}

// Compute similarity between the expressions
cosineSimilarity, err := model.Cos(x, y)
if err != nil {
log.Fatalf("error evaluating cosine similarity: %v", err)
}

```

If you only want to compute similarity between some words, but do not need to find the n most similar words to a given word,
you can use a lazy model, which initializes faster and uses less memory:

```go
// Lazy load the model from an io.Reader (i.e. a file).
model, err := word2vec.LazyFromReader(r)
if err != nil {
log.Fatalf("error loading model: %v", err)
}

// Create two expressions.
x := word2vec.Expr{"king": 1.0}
y := word2vec.Expr{"queen": 1.0}

// Compute similarity between the expressions
cosineSimilarity, err := model.Cos(x, y)
if err != nil {
log.Fatalf("error evaluating cosine similarity: %v", err)
}

```

Below is a benchmark comparing the normal and the lazy model using a model file with 456,976 (26^4) words of 300 dimensions.
You can run the benchmark with your own model by setting the var ```filename``` in ```word2vec_bench_test.go``` to the path of your binary model.


```
BenchmarkLazyModel
BenchmarkLazyModel/InitializeEager
BenchmarkLazyModel/InitializeEager-4 1 4217573971 ns/op 1193506272 B/op 1370948 allocs/op
BenchmarkLazyModel/InitializeLazy
BenchmarkLazyModel/InitializeLazy-4 4 321390344 ns/op 31409760 B/op 456996 allocs/op
BenchmarkLazyModel/LoadVectorEager
BenchmarkLazyModel/LoadVectorEager-4 4769024 266.2 ns/op 400 B/op 2 allocs/op
BenchmarkLazyModel/LoadVectorLazy
BenchmarkLazyModel/LoadVectorLazy-4 151197 8163 ns/op 3032 B/op 6 allocs/op

4217573971ns = 4,21s
321390344ns = 0,32s
1193506272B = 1,19GB
31409760B = 0,03GB(31MB)

```
17 changes: 15 additions & 2 deletions cmd/word-server/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,12 @@ import (
)

var listen, modelPath string
var lazy bool

// init registers the command-line flags for the server.
func init() {
	flag.StringVar(&listen, "listen", "localhost:1234", "bind `address` for HTTP server")
	flag.StringVar(&modelPath, "model", "", "`path` to binary model data")
	// Help text fixed: the flag's registered default is false (eager load),
	// but the original description claimed it "defaults to true".
	flag.BoolVar(&lazy, "lazy", false, "lazily load the model (CosN/MultiCosN unsupported); defaults to false, which eagerly loads it")
}

func main() {
Expand All @@ -28,17 +30,28 @@ func main() {
os.Exit(1)
}

log.Println("Loading model...")
if lazy {
log.Println("Lazy loading model...")
} else {
log.Println("Loading model...")
}
f, err := os.Open(modelPath)
if err != nil {
fmt.Printf("error opening binary model data file: %v\n", err)
os.Exit(1)
}
defer f.Close()

m, err := word2vec.FromReader(f)
var m *word2vec.Model

if lazy {
m, err = word2vec.LazyFromReader(f)
} else {
m, err = word2vec.FromReader(f)
}
if err != nil {
fmt.Printf("error reading binary model data: %v\n", err)
f.Close()
os.Exit(1)
}

Expand Down
60 changes: 60 additions & 0 deletions offset_counter.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package word2vec

import "bufio"

// offsetCounter wraps a *bufio.Reader and tracks the absolute number of
// bytes consumed from the underlying reader, so callers can record the byte
// offset at which data (e.g. a word vector) begins for later random access.
type offsetCounter struct {
	offset int64 // total bytes consumed from the underlying reader so far
	reader *bufio.Reader

	// lastRuneSize is the byte size of the rune returned by the most recent
	// ReadRune; it is used by UnreadRune to roll the offset back.
	lastRuneSize int
}

// Discard skips the next n bytes, advancing the offset by the number of
// bytes actually discarded (which may be fewer than n on error).
func (o *offsetCounter) Discard(n int) (discarded int, err error) {
	discarded, err = o.reader.Discard(n)
	o.offset += int64(discarded)
	return
}

// ReadByte reads a single byte, advancing the offset only on success.
func (o *offsetCounter) ReadByte() (byte, error) {
	b, err := o.reader.ReadByte()
	if err == nil {
		o.offset++
	}
	return b, err
}

// ReadRune reads a single UTF-8 rune, advancing the offset by its byte size.
func (o *offsetCounter) ReadRune() (r rune, size int, err error) {
	r, size, err = o.reader.ReadRune()
	o.offset += int64(size)
	o.lastRuneSize = size
	return
}

// UnreadRune unreads the rune returned by the last ReadRune and rolls the
// offset back. Implementing it (together with ReadRune) makes offsetCounter
// an io.RuneScanner, so fmt's scan functions use it directly and never lose
// a read-ahead rune (which would also desynchronize the offset).
func (o *offsetCounter) UnreadRune() error {
	// bufio only permits UnreadRune immediately after a successful ReadRune,
	// so lastRuneSize is guaranteed to describe the rune being unread.
	err := o.reader.UnreadRune()
	if err == nil {
		o.offset -= int64(o.lastRuneSize)
	}
	return err
}

// ReadSlice reads until delim, advancing the offset by the bytes returned
// (on error the partial line read so far is still counted).
func (o *offsetCounter) ReadSlice(delim byte) (line []byte, err error) {
	line, err = o.reader.ReadSlice(delim)
	o.offset += int64(len(line))
	return
}

// ReadBytes reads until delim, advancing the offset by the bytes returned.
func (o *offsetCounter) ReadBytes(delim byte) ([]byte, error) {
	b, err := o.reader.ReadBytes(delim)
	o.offset += int64(len(b))
	return b, err
}

// ReadString reads until delim, advancing the offset by the bytes returned.
func (o *offsetCounter) ReadString(delim byte) (string, error) {
	s, err := o.reader.ReadString(delim)
	o.offset += int64(len(s))
	return s, err
}

// Read implements io.Reader, advancing the offset by the bytes read.
func (o *offsetCounter) Read(p []byte) (n int, err error) {
	n, err = o.reader.Read(p)
	o.offset += int64(n)
	return
}

// UnreadByte unreads the last byte, rolling the offset back on success.
func (o *offsetCounter) UnreadByte() error {
	err := o.reader.UnreadByte()
	if err == nil {
		o.offset--
	}
	return err
}
131 changes: 121 additions & 10 deletions word2vec.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,31 @@ package word2vec // import "code.sajari.com/word2vec"
import (
"bufio"
"encoding/binary"
"errors"
"fmt"
"io"
"log"
"sync"
)

// Model is a type which represents a word2vec Model and implements the Coser
// and Mapper interfaces. Exactly one loading mode is populated: words (eager,
// via FromReader) or lazyWords+reader (lazy, via LazyFromReader).
// NOTE: the pasted version of this struct carried duplicated dim/words fields
// (a diff artifact) which would not compile; this is the merged declaration.
type Model struct {
	dim   int
	words map[string]Vector // eager mode: word -> in-memory vector

	// lazy mode: word -> byte offset of its vector within reader.
	lazyWords map[string]int64
	reader    LazyReader
}

var (
_ Coser = (*Model)(nil)
_ Mapper = (*Model)(nil)
)

// FromReader creates a Model using the binary model data provided by the io.Reader.
// FromReader creates a Model using the binary model data provided by the
// io.Reader. It loads all vectors on memory for faster access and to be able to
// find n most similar words but uses more memory and takes longer to initialize
// If you don't need to find n most similar words, consider using LazyFromReader
func FromReader(r io.Reader) (*Model, error) {
br := bufio.NewReader(r)
var size, dim int
Expand Down Expand Up @@ -73,6 +80,65 @@ func FromReader(r io.Reader) (*Model, error) {
return m, nil
}

// f32Len is the length in bytes of a float32 in the binary model format.
const f32Len = 4

// LazyReader is the input required by LazyFromReader: sequential reads for
// the initial vocabulary scan plus ReadAt for loading vectors on demand.
// An *os.File satisfies this interface.
type LazyReader interface {
	io.Reader
	io.ReaderAt
}

// LazyFromReader creates a lazily-loaded Model from the binary model data
// provided by r. Only the vocabulary and the byte offset of each word's
// vector are read up front; the vectors themselves are fetched from the
// underlying io.ReaderAt (usually an *os.File) on demand. This initializes
// faster and uses less memory than FromReader, but the resulting Model
// returns an error from CosN and MultiCosN — use FromReader if you need
// those.
func LazyFromReader(r LazyReader) (*Model, error) {
	counter := &offsetCounter{reader: bufio.NewReader(r)}

	var size, dim int
	n, err := fmt.Fscanln(counter, &size, &dim)
	if err != nil {
		return nil, err
	}
	if n != 2 {
		return nil, fmt.Errorf("could not extract size/dim from binary model data")
	}

	model := &Model{
		dim:       dim,
		reader:    r,
		lazyWords: make(map[string]int64, size),
	}

	for i := 0; i < size; i++ {
		word, err := counter.ReadString(' ')
		if err != nil {
			return nil, err
		}
		// Drop the trailing separator space; the vector data starts at the
		// current offset.
		model.lazyWords[word[:len(word)-1]] = counter.offset

		// Skip over the vector itself — it will be loaded lazily later.
		if _, err := counter.Discard(dim * f32Len); err != nil {
			return nil, err
		}

		// Consume an optional trailing newline; EOF is fine on the last entry.
		sep, err := counter.ReadByte()
		switch {
		case err == io.EOF && i == size-1:
			return model, nil
		case err != nil:
			return nil, err
		case sep != '\n':
			if err := counter.UnreadByte(); err != nil {
				return nil, err
			}
		}
	}
	return model, nil
}

// Vector is a type which represents a word vector.
type Vector []float32

Expand Down Expand Up @@ -137,6 +203,9 @@ type Coser interface {

// Size returns the number of words in the model, whether it was loaded
// eagerly (words) or lazily (lazyWords).
func (m *Model) Size() int {
	count := len(m.words)
	if m.lazyWords != nil {
		count = len(m.lazyWords)
	}
	return count
}

Expand All @@ -155,9 +224,28 @@ type Mapper interface {
// Unknown words are ignored.
func (m *Model) Map(words []string) map[string]Vector {
result := make(map[string]Vector)
for _, w := range words {
if v, ok := m.words[w]; ok {
result[w] = v
if m.words != nil {
for _, w := range words {
if v, ok := m.words[w]; ok {
result[w] = v
}
}
} else {
for _, w := range words {
if off, ok := m.lazyWords[w]; ok {

r := io.NewSectionReader(m.reader, off, int64(m.dim*f32Len))

v := make(Vector, m.dim)
if err := binary.Read(r, binary.LittleEndian, v); err != nil {
log.Printf("word2vec: LazyModel.Map read %s %s", w, err) //todo return the error and change the api?
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What should I do here?

continue
}

v.Normalise()

result[w] = v
}
}
}
return result
Expand Down Expand Up @@ -196,11 +284,25 @@ func (m *Model) Coses(pairs [][2]Expr) ([]float32, error) {
func (m *Model) Eval(expr Expr) (Vector, error) {
v := Vector(make([]float32, m.dim))
for w, c := range expr {
u, ok := m.words[w]
if !ok {
return nil, &NotFoundError{w}
if m.words != nil {
u, ok := m.words[w]
if !ok {
return nil, &NotFoundError{w}
}
v.Add(c, u)
} else {
off, ok := m.lazyWords[w]
if !ok {
return nil, &NotFoundError{w}
}
r := io.NewSectionReader(m.reader, off, int64(m.dim*f32Len))

u := Vector(make([]float32, m.dim))
if err := binary.Read(r, binary.LittleEndian, u); err != nil {
return nil, err
}
v.Add(c, u)
}
v.Add(c, u)
}
v.Normalise()
return v, nil
Expand All @@ -216,6 +318,10 @@ type Match struct {
// CosN computes the n most similar words to the expression. Returns an error if the
// expression could not be evaluated.
func (m *Model) CosN(e Expr, n int) ([]Match, error) {
if m.words == nil {
return nil, errors.New("CosN not supported on lazy model")
}

if n == 0 {
return nil, nil
}
Expand Down Expand Up @@ -278,6 +384,11 @@ type multiMatches struct {
// MultiCosN takes a list of expressions and computes the
// n most similar words for each.
func MultiCosN(m *Model, exprs []Expr, n int) ([][]Match, error) {

if m.words == nil {
return nil, errors.New("MultiCosN not supported on lazy model")
}

if n == 0 {
return make([][]Match, len(exprs)), nil
}
Expand Down
Loading