2 changes: 2 additions & 0 deletions .gitattributes
@@ -4,3 +4,5 @@ data/embeddings-130441-* filter=lfs diff=lfs merge=lfs -text
*.txt filter=lfs diff=lfs merge=lfs -text
data/embeddings-* filter=lfs diff=lfs merge=lfs -text
data/embeddings-53708-* filter=lfs diff=lfs merge=lfs -text
regina_files/classifier_data/model.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
regina_files/language_data/model.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
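For reference (not part of the diff itself), attribute lines like the two added here are what `git lfs track <path>` appends to .gitattributes, so both model checkpoint shards are stored through Git LFS rather than committed directly.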
438 changes: 437 additions & 1 deletion datasets/yelp/regina-data/sentiment.dev.short.0

Large diffs are not rendered by default.

100 changes: 100 additions & 0 deletions regina_files/classifier.py
@@ -0,0 +1,100 @@
import tensorflow as tf

from regina_files.nn import cnn
from regina_files.vocab import Vocabulary


class Model(object):
def __init__(self, vocab):
dim_emb = 100
        filter_sizes = [3, 4, 5]
n_filters = 128

self.dropout = tf.placeholder(tf.float32, name='dropout')
self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')
self.x = tf.placeholder(tf.int32, [None, None], # batch_size * max_len
name='x')
self.y = tf.placeholder(tf.float32, [None], name='y')

embedding = tf.get_variable('embedding', [vocab.size, dim_emb])
x = tf.nn.embedding_lookup(embedding, self.x)
self.logits = cnn(x, filter_sizes, n_filters, self.dropout, 'cnn')
self.probs = tf.sigmoid(self.logits)

loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.y, logits=self.logits)
self.loss = tf.reduce_mean(loss)
self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

self.saver = tf.train.Saver()


def load_model(sess, model):
saver_dir = './regina_files/classifier_data/'
print('Loading classifier model from {}'.format(saver_dir))
checkpoint_path = tf.train.get_checkpoint_state(saver_dir)
if checkpoint_path is not None:
model.saver.restore(sess, checkpoint_path.model_checkpoint_path)
else:
raise Exception('no classifier model')
return model


def evaluate(sess, batch_size, vocab, model, x, y):
probs = []
batches = get_batches(x, y, vocab.word2id, batch_size)
for batch in batches:
p = sess.run(model.probs, feed_dict={model.x: batch['x'], model.dropout: 1})
probs += p.tolist()
y_hat = [p > 0.5 for p in probs]
same = [p == q for p, q in zip(y, y_hat)]
return 100.0 * sum(same) / len(y), probs


def get_batches(x, y, word2id, batch_size, min_len=5):
pad = word2id['<pad>']
unk = word2id['<unk>']

batches = []
s = 0
while s < len(x):
t = min(s + batch_size, len(x))

_x = []
max_len = max([len(sent) for sent in x[s:t]])
max_len = max(max_len, min_len)
for sent in x[s:t]:
            sent_id = [word2id[w] if w in word2id else unk for w in sent]
            # pad on the left so all sentences in the batch are right-aligned
            padding = [pad] * (max_len - len(sent))
            _x.append(padding + sent_id)

batches.append({'x': _x, 'y': y[s:t]})
s = t

return batches


def prepare(sentences):
    # Keep sentences as token lists (get_batches looks up individual words),
    # label everything as positive, and sort by length for tighter batches.
    y = [1] * len(sentences)
    z = sorted(zip(sentences, y), key=lambda i: len(i[0]))
    return list(zip(*z))


class Classifier:
def __init__(self, batch_size=100):
self.vocab = Vocabulary('./regina_files/classifier_data/yelp.vocab')
print('vocabulary size', self.vocab.size)
self.batch_size = batch_size
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
        g_classifier = tf.Graph()
        with g_classifier.as_default():
            model = Model(self.vocab)
        self.sess = tf.Session(config=config, graph=g_classifier)
self.model = load_model(self.sess, model)

def test(self, sentences):
test_x, test_y = prepare(sentences)
acc, _ = evaluate(self.sess, self.batch_size, self.vocab, self.model, test_x, test_y)
print('dev accuracy %.2f' % acc)
return acc
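
A minimal usage sketch (not part of this diff) for the Classifier class above, assuming the checkpoint under regina_files/classifier_data/ is present; the example sentences are made up:

from regina_files.classifier import Classifier

# two made-up tokenized sentences; real inputs are style-transfer outputs
sentences = [['the', 'food', 'was', 'great'],
             ['service', 'was', 'slow', 'and', 'rude']]

clf = Classifier(batch_size=100)
acc = clf.test(sentences)   # prints and returns the % of inputs classified as positive
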
2 changes: 2 additions & 0 deletions regina_files/classifier_data/checkpoint
@@ -0,0 +1,2 @@
model_checkpoint_path: "model"
all_model_checkpoint_paths: "model"
3 changes: 3 additions & 0 deletions regina_files/classifier_data/model.data-00000-of-00001
Git LFS file not shown
Binary file added regina_files/classifier_data/model.index
Binary file not shown.
Binary file added regina_files/classifier_data/model.meta
Binary file not shown.
Binary file added regina_files/classifier_data/yelp.vocab
Binary file not shown.
55 changes: 55 additions & 0 deletions regina_files/file_io.py
@@ -0,0 +1,55 @@
from nltk import word_tokenize, sent_tokenize


def load_doc(path):
data = []
with open(path) as f:
for line in f:
sents = sent_tokenize(line)
doc = [word_tokenize(sent) for sent in sents]
data.append(doc)
return data


def load_sent(path, max_size=-1):
data = []
with open(path) as f:
for line in f:
if len(data) == max_size:
break
data.append(line.split())
return data


def load_vec(path):
x = []
with open(path) as f:
for line in f:
p = line.split()
p = [float(v) for v in p]
x.append(p)
return x


def write_doc(docs, sents, path):
    # `docs` only supplies the per-document sentence count; consecutive
    # sentences from the flat `sents` list go on one line per document.
    with open(path, 'w') as f:
        index = 0
for doc in docs:
for i in range(len(doc)):
f.write(' '.join(sents[index]))
f.write('\n' if i == len(doc) - 1 else ' ')
index += 1


def write_sent(sents, path):
with open(path, 'w') as f:
for sent in sents:
f.write(' '.join(sent) + '\n')


def write_vec(vecs, path):
with open(path, 'w') as f:
for vec in vecs:
for i, x in enumerate(vec):
f.write('%.3f' % x)
f.write('\n' if i == len(vec) - 1 else ' ')
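
A brief sketch (not part of this diff) of how these helpers compose, assuming a hypothetical input.txt with one raw document per line and NLTK's punkt tokenizer installed:

from regina_files import file_io

docs = file_io.load_doc('input.txt')            # hypothetical path: one document per line
flat = [sent for doc in docs for sent in doc]   # flatten to tokenized sentences
file_io.write_sent(flat, 'sentences.txt')       # one tokenized sentence per line
file_io.write_doc(docs, flat, 'rebuilt.txt')    # regroup sentences into one line per document
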
2 changes: 2 additions & 0 deletions regina_files/language_data/checkpoint
@@ -0,0 +1,2 @@
model_checkpoint_path: "model"
all_model_checkpoint_paths: "model"
3 changes: 3 additions & 0 deletions regina_files/language_data/model.data-00000-of-00001
Git LFS file not shown
Binary file added regina_files/language_data/model.index
Binary file not shown.
Binary file added regina_files/language_data/model.meta
Binary file not shown.
Binary file added regina_files/language_data/yelp.vocab
Binary file not shown.
110 changes: 110 additions & 0 deletions regina_files/language_model.py
@@ -0,0 +1,110 @@
import numpy as np
import tensorflow as tf

from regina_files.nn import create_cell
from regina_files.vocab import Vocabulary


class Model(object):
def __init__(self, vocab):
dim_z = 500
n_layers = 1

self.dropout = tf.placeholder(tf.float32, name='dropout')
self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')
self.batch_size = tf.placeholder(tf.int32, name='batch_size')
self.inputs = tf.placeholder(tf.int32, [None, None], # batch_size * max_len
name='inputs')
self.targets = tf.placeholder(tf.int32, [None, None], name='targets')
self.weights = tf.placeholder(tf.float32, [None, None], name='weights')

embedding = tf.get_variable('embedding', initializer=vocab.embedding.astype(np.float32))
with tf.variable_scope('projection'):
proj_W = tf.get_variable('W', [dim_z, vocab.size])
proj_b = tf.get_variable('b', [vocab.size])

inputs = tf.nn.embedding_lookup(embedding, self.inputs)
cell = create_cell(dim_z, n_layers, self.dropout)
outputs, _ = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32, scope='language_model')
outputs = tf.nn.dropout(outputs, self.dropout)
outputs = tf.reshape(outputs, [-1, dim_z])
self.logits = tf.matmul(outputs, proj_W) + proj_b

loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.reshape(self.targets, [-1]), logits=self.logits)
loss *= tf.reshape(self.weights, [-1])
self.tot_loss = tf.reduce_sum(loss)
self.sent_loss = self.tot_loss / tf.to_float(self.batch_size)

self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.sent_loss)

self.saver = tf.train.Saver()


def load_model(sess, model):
saver_dir = './regina_files/language_data/'
print('Loading language model from {}'.format(saver_dir))
checkpoint_path = tf.train.get_checkpoint_state(saver_dir)
if checkpoint_path is not None:
model.saver.restore(sess, checkpoint_path.model_checkpoint_path)
else:
raise Exception('no language model')
return model


def get_lm_batches(x, word2id, batch_size):
pad = word2id['<pad>']
go = word2id['<go>']
eos = word2id['<eos>']
unk = word2id['<unk>']

x = sorted(x, key=lambda i: len(i))

batches = []
s = 0
while s < len(x):
t = min(s + batch_size, len(x))

go_x, x_eos, weights = [], [], []
max_len = max([len(sent) for sent in x[s:t]])
for sent in x[s:t]:
sent_id = [word2id[w] if w in word2id else unk for w in sent]
l = len(sent)
padding = [pad] * (max_len - l)
go_x.append([go] + sent_id + padding)
x_eos.append(sent_id + [eos] + padding)
weights.append([1.0] * (l + 1) + [0.0] * (max_len - l))

batches.append({'inputs': go_x, 'targets': x_eos, 'weights': weights, 'size': t - s})
s = t

return batches


def evaluate(sess, batch_size, vocab, model, x):
batches = get_lm_batches(x, vocab.word2id, batch_size)
tot_loss, n_words = 0, 0

for batch in batches:
        tot_loss += sess.run(model.tot_loss,
                             feed_dict={model.batch_size: batch['size'],
                                        model.inputs: batch['inputs'],
                                        model.targets: batch['targets'],
                                        model.weights: batch['weights'],
                                        model.dropout: 1})
n_words += np.sum(batch['weights'])

return np.exp(tot_loss / n_words)


class LanguageModel:
def __init__(self, batch_size=100):
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
self.batch_size = batch_size
self.vocab = Vocabulary('./regina_files/language_data/yelp.vocab', 100)
print('vocabulary size', self.vocab.size)
g_language = tf.Graph()
with g_language.as_default():
model = Model(self.vocab)
self.sess = tf.Session(config=config, graph=g_language)
self.model = load_model(self.sess, model)

def test(self, sentences):
ppl = evaluate(self.sess, self.batch_size, self.vocab, self.model, sentences)
print('dev perplexity %.2f' % ppl)
return ppl
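
And a matching sketch (not part of this diff) for the language model, again with made-up tokenized sentences and assuming the checkpoint under regina_files/language_data/:

from regina_files.language_model import LanguageModel

sentences = [['the', 'food', 'was', 'great'],
             ['i', 'would', 'definitely', 'come', 'back']]

lm = LanguageModel(batch_size=100)
ppl = lm.test(sentences)   # prints and returns corpus-level perplexity under the pretrained LM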