2 changes: 2 additions & 0 deletions .gitattributes
@@ -4,3 +4,5 @@ data/embeddings-130441-* filter=lfs diff=lfs merge=lfs -text
*.txt filter=lfs diff=lfs merge=lfs -text
data/embeddings-* filter=lfs diff=lfs merge=lfs -text
data/embeddings-53708-* filter=lfs diff=lfs merge=lfs -text
regina_files/classifier_data/model.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
regina_files/language_data/model.data-00000-of-00001 filter=lfs diff=lfs merge=lfs -text
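For reference (not part of the diff itself), attribute lines like the two added here are what `git lfs track <path>` appends to .gitattributes, so both model checkpoint shards are stored through Git LFS rather than committed directly.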
438 changes: 437 additions & 1 deletion datasets/yelp/regina-data/sentiment.dev.short.0

Large diffs are not rendered by default.

100 changes: 100 additions & 0 deletions regina_files/classifier.py
@@ -0,0 +1,100 @@
import tensorflow as tf

from regina_files.nn import cnn
from regina_files.vocab import Vocabulary


class Model(object):
def __init__(self, vocab):
dim_emb = 100
        filter_sizes = [3, 4, 5]
n_filters = 128

self.dropout = tf.placeholder(tf.float32, name='dropout')
self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')
self.x = tf.placeholder(tf.int32, [None, None], # batch_size * max_len
name='x')
self.y = tf.placeholder(tf.float32, [None], name='y')

embedding = tf.get_variable('embedding', [vocab.size, dim_emb])
x = tf.nn.embedding_lookup(embedding, self.x)
self.logits = cnn(x, filter_sizes, n_filters, self.dropout, 'cnn')
self.probs = tf.sigmoid(self.logits)

loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.y, logits=self.logits)
self.loss = tf.reduce_mean(loss)
self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

self.saver = tf.train.Saver()


def load_model(sess, model):
saver_dir = './regina_files/classifier_data/'
print('Loading classifier model from {}'.format(saver_dir))
checkpoint_path = tf.train.get_checkpoint_state(saver_dir)
if checkpoint_path is not None:
model.saver.restore(sess, checkpoint_path.model_checkpoint_path)
else:
raise Exception('no classifier model')
return model


def evaluate(sess, batch_size, vocab, model, x, y):
probs = []
batches = get_batches(x, y, vocab.word2id, batch_size)
for batch in batches:
p = sess.run(model.probs, feed_dict={model.x: batch['x'], model.dropout: 1})
probs += p.tolist()
y_hat = [p > 0.5 for p in probs]
same = [p == q for p, q in zip(y, y_hat)]
return 100.0 * sum(same) / len(y), probs


def get_batches(x, y, word2id, batch_size, min_len=5):
pad = word2id['<pad>']
unk = word2id['<unk>']

batches = []
s = 0
while s < len(x):
t = min(s + batch_size, len(x))

_x = []
max_len = max([len(sent) for sent in x[s:t]])
max_len = max(max_len, min_len)
for sent in x[s:t]:
            sent_id = [word2id[w] if w in word2id else unk for w in sent]
            # pad on the left so all sentences in the batch are right-aligned
            padding = [pad] * (max_len - len(sent))
            _x.append(padding + sent_id)

batches.append({'x': _x, 'y': y[s:t]})
s = t

return batches


def prepare(sentences):
    # Keep sentences as token lists (get_batches looks up individual words),
    # label everything as positive, and sort by length for tighter batches.
    y = [1] * len(sentences)
    z = sorted(zip(sentences, y), key=lambda i: len(i[0]))
    return list(zip(*z))


class Classifier:
def __init__(self, batch_size=100):
self.vocab = Vocabulary('./regina_files/classifier_data/yelp.vocab')
print('vocabulary size', self.vocab.size)
self.batch_size = batch_size
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
        g_classifier = tf.Graph()
        with g_classifier.as_default():
            model = Model(self.vocab)
        self.sess = tf.Session(config=config, graph=g_classifier)
self.model = load_model(self.sess, model)

def test(self, sentences):
test_x, test_y = prepare(sentences)
acc, _ = evaluate(self.sess, self.batch_size, self.vocab, self.model, test_x, test_y)
print('dev accuracy %.2f' % acc)
return acc
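
A minimal usage sketch (not part of this diff) for the Classifier class above, assuming the checkpoint under regina_files/classifier_data/ is present; the example sentences are made up:

from regina_files.classifier import Classifier

# two made-up tokenized sentences; real inputs are style-transfer outputs
sentences = [['the', 'food', 'was', 'great'],
             ['service', 'was', 'slow', 'and', 'rude']]

clf = Classifier(batch_size=100)
acc = clf.test(sentences)   # prints and returns the % of inputs classified as positive
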
2 changes: 2 additions & 0 deletions regina_files/classifier_data/checkpoint
@@ -0,0 +1,2 @@
model_checkpoint_path: "model"
all_model_checkpoint_paths: "model"
3 changes: 3 additions & 0 deletions regina_files/classifier_data/model.data-00000-of-00001
Git LFS file not shown
Binary file added regina_files/classifier_data/model.index
Binary file not shown.
Binary file added regina_files/classifier_data/model.meta
Binary file not shown.
Binary file added regina_files/classifier_data/yelp.vocab
Binary file not shown.
55 changes: 55 additions & 0 deletions regina_files/file_io.py
@@ -0,0 +1,55 @@
from nltk import word_tokenize, sent_tokenize


def load_doc(path):
data = []
with open(path) as f:
for line in f:
sents = sent_tokenize(line)
doc = [word_tokenize(sent) for sent in sents]
data.append(doc)
return data


def load_sent(path, max_size=-1):
data = []
with open(path) as f:
for line in f:
if len(data) == max_size:
break
data.append(line.split())
return data


def load_vec(path):
x = []
with open(path) as f:
for line in f:
p = line.split()
p = [float(v) for v in p]
x.append(p)
return x


def write_doc(docs, sents, path):
    # `docs` only supplies the per-document sentence count; consecutive
    # sentences from the flat `sents` list go on one line per document.
    with open(path, 'w') as f:
        index = 0
for doc in docs:
for i in range(len(doc)):
f.write(' '.join(sents[index]))
f.write('\n' if i == len(doc) - 1 else ' ')
index += 1


def write_sent(sents, path):
with open(path, 'w') as f:
for sent in sents:
f.write(' '.join(sent) + '\n')


def write_vec(vecs, path):
with open(path, 'w') as f:
for vec in vecs:
for i, x in enumerate(vec):
f.write('%.3f' % x)
f.write('\n' if i == len(vec) - 1 else ' ')
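
A brief sketch (not part of this diff) of how these helpers compose, assuming a hypothetical input.txt with one raw document per line and NLTK's punkt tokenizer installed:

from regina_files import file_io

docs = file_io.load_doc('input.txt')            # hypothetical path: one document per line
flat = [sent for doc in docs for sent in doc]   # flatten to tokenized sentences
file_io.write_sent(flat, 'sentences.txt')       # one tokenized sentence per line
file_io.write_doc(docs, flat, 'rebuilt.txt')    # regroup sentences into one line per document
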
2 changes: 2 additions & 0 deletions regina_files/language_data/checkpoint
@@ -0,0 +1,2 @@
model_checkpoint_path: "model"
all_model_checkpoint_paths: "model"
3 changes: 3 additions & 0 deletions regina_files/language_data/model.data-00000-of-00001
Git LFS file not shown
Binary file added regina_files/language_data/model.index
Binary file not shown.
Binary file added regina_files/language_data/model.meta
Binary file not shown.
Binary file added regina_files/language_data/yelp.vocab
Binary file not shown.
110 changes: 110 additions & 0 deletions regina_files/language_model.py
@@ -0,0 +1,110 @@
import numpy as np
import tensorflow as tf

from regina_files.nn import create_cell
from regina_files.vocab import Vocabulary


class Model(object):
def __init__(self, vocab):
dim_z = 500
n_layers = 1

self.dropout = tf.placeholder(tf.float32, name='dropout')
self.learning_rate = tf.placeholder(tf.float32, name='learning_rate')
self.batch_size = tf.placeholder(tf.int32, name='batch_size')
self.inputs = tf.placeholder(tf.int32, [None, None], # batch_size * max_len
name='inputs')
self.targets = tf.placeholder(tf.int32, [None, None], name='targets')
self.weights = tf.placeholder(tf.float32, [None, None], name='weights')

embedding = tf.get_variable('embedding', initializer=vocab.embedding.astype(np.float32))
with tf.variable_scope('projection'):
proj_W = tf.get_variable('W', [dim_z, vocab.size])
proj_b = tf.get_variable('b', [vocab.size])

inputs = tf.nn.embedding_lookup(embedding, self.inputs)
cell = create_cell(dim_z, n_layers, self.dropout)
outputs, _ = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32, scope='language_model')
outputs = tf.nn.dropout(outputs, self.dropout)
outputs = tf.reshape(outputs, [-1, dim_z])
self.logits = tf.matmul(outputs, proj_W) + proj_b

loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=tf.reshape(self.targets, [-1]), logits=self.logits)
loss *= tf.reshape(self.weights, [-1])
self.tot_loss = tf.reduce_sum(loss)
self.sent_loss = self.tot_loss / tf.to_float(self.batch_size)

self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.sent_loss)

self.saver = tf.train.Saver()


def load_model(sess, model):
saver_dir = './regina_files/language_data/'
print('Loading language model from {}'.format(saver_dir))
checkpoint_path = tf.train.get_checkpoint_state(saver_dir)
if checkpoint_path is not None:
model.saver.restore(sess, checkpoint_path.model_checkpoint_path)
else:
raise Exception('no language model')
return model


def get_lm_batches(x, word2id, batch_size):
pad = word2id['<pad>']
go = word2id['<go>']
eos = word2id['<eos>']
unk = word2id['<unk>']

x = sorted(x, key=lambda i: len(i))

batches = []
s = 0
while s < len(x):
t = min(s + batch_size, len(x))

go_x, x_eos, weights = [], [], []
max_len = max([len(sent) for sent in x[s:t]])
for sent in x[s:t]:
sent_id = [word2id[w] if w in word2id else unk for w in sent]
l = len(sent)
padding = [pad] * (max_len - l)
go_x.append([go] + sent_id + padding)
x_eos.append(sent_id + [eos] + padding)
weights.append([1.0] * (l + 1) + [0.0] * (max_len - l))

batches.append({'inputs': go_x, 'targets': x_eos, 'weights': weights, 'size': t - s})
s = t

return batches


def evaluate(sess, batch_size, vocab, model, x):
batches = get_lm_batches(x, vocab.word2id, batch_size)
tot_loss, n_words = 0, 0

for batch in batches:
        tot_loss += sess.run(model.tot_loss,
                             feed_dict={model.batch_size: batch['size'],
                                        model.inputs: batch['inputs'],
                                        model.targets: batch['targets'],
                                        model.weights: batch['weights'],
                                        model.dropout: 1})
n_words += np.sum(batch['weights'])

return np.exp(tot_loss / n_words)


class LanguageModel:
def __init__(self, batch_size=100):
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
self.batch_size = batch_size
self.vocab = Vocabulary('./regina_files/language_data/yelp.vocab', 100)
print('vocabulary size', self.vocab.size)
g_language = tf.Graph()
with g_language.as_default():
model = Model(self.vocab)
self.sess = tf.Session(config=config, graph=g_language)
self.model = load_model(self.sess, model)

def test(self, sentences):
ppl = evaluate(self.sess, self.batch_size, self.vocab, self.model, sentences)
print('dev perplexity %.2f' % ppl)
return ppl
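
And a matching sketch (not part of this diff) for the language model, again with made-up tokenized sentences and assuming the checkpoint under regina_files/language_data/:

from regina_files.language_model import LanguageModel

sentences = [['the', 'food', 'was', 'great'],
             ['i', 'would', 'definitely', 'come', 'back']]

lm = LanguageModel(batch_size=100)
ppl = lm.test(sentences)   # prints and returns corpus-level perplexity under the pretrained LM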