From 70a8e88a423e9772f948f616156e67a030fa7b0c Mon Sep 17 00:00:00 2001 From: owendall Date: Sun, 21 Aug 2016 09:05:06 -0400 Subject: [PATCH 1/6] First changelog entry --- CHANGELOG.md | 6 ++++++ README.md | 12 +++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..f646cfe --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,6 @@ +# Changelog +## 1.0.1 + +2016-08-20 +- Forked master branch from mimno/jsLDA in order to offer contributions to the Topic Modeling community. +- Version 1.0.1 provides bug fixes as per the Semantic Versioning 2.0.0 standard http://semver.org/ diff --git a/README.md b/README.md index ed1e61c..eefc24e 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,11 @@ -jsLDA -===== +jsLDA-AI +======== +This is a fork of jsLDA by David Mimno with enhancements by Agile Innovations, LLC team. + +2016-08-20 +- Forked by Owen Dall +- Created CHANGELOG.md to follow standards suggested by https://github.com/olivierlacan/keep-a-changelog +- An implementation of latent Dirichlet allocation in javascript. A [live demonstration](http://mimno.infosci.cornell.edu/jsLDA/jslda.html) is available. @@ -42,4 +48,4 @@ Download results: ---------------- You can create reports about your topic model. Hit the `Downloads` tab. -Reports are in CSV format. The `sampling state` file contains the same information as a Mallet state file, but in a more compact format. +Reports are in CSV format. The `sampling state` file contains the same information as a Mallet state file, but in a more compact format. 
From 3310f4b01b51d7d9f99fdd52de04bc17818614bc Mon Sep 17 00:00:00 2001 From: owendall Date: Sun, 21 Aug 2016 14:18:30 -0400 Subject: [PATCH 2/6] Minor enhancements to download file labels --- bank-vs-bank.txt | 5 ++ exports/doctopics.csv | 6 ++ exports/gephi.csv | 9 +++ exports/keys.csv | 3 + exports/state.csv | 36 ++++++++++ exports/topictopic.csv | 2 + exports/topicwords.csv | 6 ++ jslda.js | 150 ++++++++++++++++++++++------------------- state.csv | 35 ++++++++++ topictopic.csv | 2 + topicwords.csv | 6 ++ 11 files changed, 191 insertions(+), 69 deletions(-) create mode 100644 bank-vs-bank.txt create mode 100644 exports/doctopics.csv create mode 100644 exports/gephi.csv create mode 100644 exports/keys.csv create mode 100644 exports/state.csv create mode 100644 exports/topictopic.csv create mode 100644 exports/topicwords.csv create mode 100644 state.csv create mode 100644 topictopic.csv create mode 100644 topicwords.csv diff --git a/bank-vs-bank.txt b/bank-vs-bank.txt new file mode 100644 index 0000000..0efec71 --- /dev/null +++ b/bank-vs-bank.txt @@ -0,0 +1,5 @@ +1 2016 the money from the bank loan. the bank money. The money for the bank loan +2 2016 a money bank. a river bank. a stream of money loan from the bank. +3 2016 the river bank. the stream. the river bank for the river. a stream bank. +4 2016 the money from the bank. bank loan. the bank was a bank. +5 2016 the river bank. the river was a river with a stream. 
diff --git a/exports/doctopics.csv b/exports/doctopics.csv new file mode 100644 index 0000000..2b9d0aa --- /dev/null +++ b/exports/doctopics.csv @@ -0,0 +1,6 @@ +Doc,Topic-0,Topic-1 +0,0,0.53333333 +1,0.18181818,0.54545455 +2,0.38461538,0.23076923 +3,0,0.54545455 +4,0.44444444,0.11111111 diff --git a/exports/gephi.csv b/exports/gephi.csv new file mode 100644 index 0000000..f3f3453 --- /dev/null +++ b/exports/gephi.csv @@ -0,0 +1,9 @@ +Source,Target,Weight,Type +0,1,0.53333333,undirected +1,0,0.18181818,undirected +1,1,0.54545455,undirected +2,0,0.38461538,undirected +2,1,0.23076923,undirected +3,1,0.54545455,undirected +4,0,0.44444444,undirected +4,1,0.11111111,undirected diff --git a/exports/keys.csv b/exports/keys.csv new file mode 100644 index 0000000..bcd2e5a --- /dev/null +++ b/exports/keys.csv @@ -0,0 +1,3 @@ +Topic,TokenCount,Words +0,11,"river stream money bank loan" +1,24,"bank money loan river stream" diff --git a/exports/state.csv b/exports/state.csv new file mode 100644 index 0000000..573395c --- /dev/null +++ b/exports/state.csv @@ -0,0 +1,36 @@ +DocID,Word,Topic +0,"money",1 +0,"bank",1 +0,"loan",1 +0,"bank",1 +0,"money",1 +0,"money",1 +0,"bank",1 +0,"loan",1 +1,"money",1 +1,"bank",1 +1,"river",0 +1,"bank",1 +1,"stream",0 +1,"money",1 +1,"loan",1 +1,"bank",1 +2,"river",0 +2,"bank",1 +2,"stream",0 +2,"river",0 +2,"bank",1 +2,"river",0 +2,"stream",0 +2,"bank",1 +3,"money",1 +3,"bank",1 +3,"bank",1 +3,"loan",1 +3,"bank",1 +3,"bank",1 +4,"river",0 +4,"bank",1 +4,"river",0 +4,"river",0 +4,"stream",0 diff --git a/exports/topictopic.csv b/exports/topictopic.csv new file mode 100644 index 0000000..510f6e1 --- /dev/null +++ b/exports/topictopic.csv @@ -0,0 +1,2 @@ +0,-0.18232156 +-0.18232156,0 diff --git a/exports/topicwords.csv b/exports/topicwords.csv new file mode 100644 index 0000000..df6e5a5 --- /dev/null +++ b/exports/topicwords.csv @@ -0,0 +1,6 @@ +word,Topic-0,Topic-1 +money,0,0.25 +bank,0,0.58333333 +loan,0,0.16666667 +river,0.63636364,0 
+stream,0.36363636,0 diff --git a/jslda.js b/jslda.js index 5ce1f88..13a2907 100644 --- a/jslda.js +++ b/jslda.js @@ -1,7 +1,7 @@ /** This function is copied from stack overflow: http://stackoverflow.com/users/19068/quentin */ var QueryString = function () { - // This function is anonymous, is executed immediately and + // This function is anonymous, is executed immediately and // the return value is assigned to QueryString! var query_string = {}; var query = window.location.search.substring(1); @@ -19,7 +19,7 @@ var QueryString = function () { } else { query_string[pair[0]].push(pair[1]); } - } + } return query_string; } (); @@ -31,6 +31,14 @@ var stopwordsURL = QueryString.stoplist ? QueryString.stoplist : "stoplist.txt"; documentsURL = decodeURIComponent(documentsURL); stopwordsURL = decodeURIComponent(stopwordsURL); +/********************************************************************************** + 2016-08-20 Owen Dall: Make record delimiter configurable +***********************************************************************************/ +// Unix/Mac OS X => LF => "\n"; Windows => CRLF => "\r\n" +var recordDelimiter = "\n"; +// Revisit placing this option in the UI +/**********************************************************************************/ + function zeros(n) { var x = new Array(n); for (var i = 0; i < n; i++) { x[i] = 0.0; } @@ -75,7 +83,7 @@ d3.select("#num-topics-input").attr("value", numTopics); var stopwords = {}; // ["the", "and", "of", "for", "in", "a", "on", "is", "an", "this", "to", "by", "abstract", "paper", "based", "with", "or", "are", "from", "upon", "we", "us", "our", "can", "be", "using", "which", "that", "d", "n", "as", "it", "show", "these", "such", "s", "t", "i", "j", "have", "one", "new", "one", "has", "learning", "model", "data", "models", "two", "used", "results"].forEach( function(d) { stopwords[d] = 1; } ); -// Use a more agressive smoothing parameter to sort +// Use a more agressive smoothing parameter to sort // documents 
by topic. This has the effect of preferring // longer documents. var docSortSmoothing = 10.0; @@ -133,9 +141,9 @@ function parseLine ( line ) { rawTokens.forEach(function (word) { if (word !== "") { var topic = Math.floor(Math.random() * numTopics); - + if (word.length <= 2) { stopwords[word] = 1; } - + var isStopword = stopwords[word]; if (isStopword) { // Record counts for stopwords, but nothing else @@ -174,7 +182,7 @@ function addStop(word) { stopwords[word] = 1; vocabularySize--; delete wordTopicCounts[word]; - + documents.forEach( function( currentDoc, i ) { var docTopicCounts = currentDoc.topicCounts; for (var position = 0; position < currentDoc.tokens.length; position++) { @@ -186,7 +194,7 @@ function addStop(word) { } } }); - + sortTopicWords(); displayTopicWords(); reorderDocuments(); @@ -198,7 +206,7 @@ function removeStop(word) { vocabularySize++; wordTopicCounts[word] = {}; var currentWordTopicCounts = wordTopicCounts[ word ]; - + documents.forEach( function( currentDoc, i ) { var docTopicCounts = currentDoc.topicCounts; for (var position = 0; position < currentDoc.tokens.length; position++) { @@ -216,7 +224,7 @@ function removeStop(word) { } } }); - + sortTopicWords(); displayTopicWords(); reorderDocuments(); @@ -225,7 +233,7 @@ function removeStop(word) { function sweep() { var startTime = Date.now(); - + var topicNormalizers = zeros(numTopics); for (var topic = 0; topic < numTopics; topic++) { topicNormalizers[topic] = 1.0 / (vocabularySize * topicWordSmoothing + tokensPerTopic[topic]); @@ -234,7 +242,7 @@ function sweep() { for (var doc = 0; doc < documents.length; doc++) { var currentDoc = documents[doc]; var docTopicCounts = currentDoc.topicCounts; - + for (var position = 0; position < currentDoc.tokens.length; position++) { var token = currentDoc.tokens[position]; if (token.isStopword) { continue; } @@ -264,7 +272,7 @@ function sweep() { } sum += topicWeights[topic]; } - + // Sample from an unnormalized discrete distribution var sample = sum * 
Math.random(); var i = 0; @@ -274,7 +282,7 @@ function sweep() { sample -= topicWeights[i]; } token.topic = i; - + tokensPerTopic[ token.topic ]++; if (! currentWordTopicCounts[ token.topic ]) { currentWordTopicCounts[ token.topic ] = 1; @@ -283,16 +291,16 @@ function sweep() { currentWordTopicCounts[ token.topic ] += 1; } docTopicCounts[ token.topic ]++; - + topicNormalizers[ token.topic ] = 1.0 / (vocabularySize * topicWordSmoothing + tokensPerTopic[ token.topic ]); } } - + //console.log("sweep in " + (Date.now() - startTime) + " ms"); completeSweeps += 1; d3.select("#iters").text(completeSweeps); if (completeSweeps >= requestedSweeps) { - reorderDocuments(); + reorderDocuments(); sortTopicWords(); displayTopicWords(); plotMatrix(); @@ -322,7 +330,7 @@ function sortTopicWords() { for (var topic = 0; topic < numTopics; topic++) { topicWordCounts[topic].sort(byCountDescending); - } + } } function displayTopicWords() { @@ -347,12 +355,12 @@ function displayTopicWords() { function reorderDocuments() { var format = d3.format(".2g"); - + if (selectedTopic === -1) { documents.sort(function(a, b) { return d3.ascending(a.originalOrder, b.originalOrder); }); d3.selectAll("div.document").data(documents) .style("display", "block") - .text(function(d) { return "[" + d.id + "] " + truncate(d.originalText); }); + .text(function(d) { return "[" + d.id + "] " + truncate(d.originalText); }); } else { var scores = documents.map(function (doc, i) { @@ -361,7 +369,7 @@ function reorderDocuments() { scores.sort(function(a, b) { return b.score - a.score; }); - /*documents.sort(function(a, b) { + /*documents.sort(function(a, b) { var score1 = (a.topicCounts[selectedTopic] + docSortSmoothing) / (a.tokens.length + sumDocSortSmoothing); var score2 = (b.topicCounts[selectedTopic] + docSortSmoothing) / (b.tokens.length + sumDocSortSmoothing); return d3.descending(score1, score2); @@ -381,33 +389,33 @@ var topicTimeGroups = new Array(); function createTimeSVGs () { var tsPage = 
d3.select("#ts-page"); var tsSVG = tsPage.append("svg").attr("height", timeSeriesHeight * numTopics).attr("width", timeSeriesWidth); - + for (var topic = 0; topic < numTopics; topic++) { topicTimeGroups.push(tsSVG.append("g").attr("transform", "translate(0," + (timeSeriesHeight * topic) + ")")); topicTimeGroups[topic].append("path").style("fill", "#ccc"); topicTimeGroups[topic].append("text").attr("y", 40); } - + } function timeSeries() { var tsPage = d3.select("#ts-page"); - + for (var topic = 0; topic < numTopics; topic++) { var topicProportions = documents.map(function (d) { return {date: d.date, p: d.topicCounts[topic] / d.tokens.length}; }) var topicMeans = d3.nest().key(function (d) {return d.date; }).rollup(function (d) {return d3.mean(d, function (x) {return x.p}); }).entries(topicProportions); - + var xScale = d3.scale.linear().domain([0, topicMeans.length]).range([0, timeSeriesWidth]); var yScale = d3.scale.linear().domain([0, 0.2]).range([timeSeriesHeight, 0]); var area = d3.svg.area() .x(function (d, i) { return xScale(i); }) .y(function (d) { return yScale(d.values); }) .y0(yScale(0)); - + topicTimeGroups[topic].select("path").attr("d", area(topicMeans)); topicTimeGroups[topic].select("text").text(topNWords(topicWordCounts[topic], 3)) } - + } // @@ -417,7 +425,7 @@ function timeSeries() { /* This function will compute pairwise correlations between topics. * Unlike the correlated topic model (CTM) LDA doesn't have parameters * that represent topic correlations. But that doesn't mean that topics are - * not correlated, it just means we have to estimate those values by + * not correlated, it just means we have to estimate those values by * measuring which topics appear in documents together. 
*/ function getTopicCorrelations() { @@ -439,7 +447,7 @@ function getTopicCorrelations() { var tokenCutoff = Math.max(correlationMinTokens, correlationMinProportion * d.tokens.length); for (var topic = 0; topic < numTopics; topic++) { - if (d.topicCounts[topic] >= tokenCutoff) { + if (d.topicCounts[topic] >= tokenCutoff) { documentTopics.push(topic); topicProbabilities[topic]++; // Count the number of docs with this topic } @@ -485,19 +493,19 @@ function plotMatrix() { var right = 500; var top = 50; var bottom = 500; - + var correlationMatrix = getTopicCorrelations(); var correlationGraph = getCorrelationGraph(correlationMatrix, -100.0); - + var topicScale = d3.scale.ordinal().domain(d3.range(numTopics)).rangePoints([left, right]); var radiusScale = d3.scale.sqrt().domain([0, 1.0]).range([0, 450 / (2 * numTopics)]); - + var horizontalTopics = vis.selectAll("text.hor").data(correlationGraph.nodes); horizontalTopics.enter().append("text") .attr("class", "hor") .attr("x", right + 10) .attr("y", function(node) { return topicScale(node.name); }); - + horizontalTopics .text(function(node) { return node.words; }); @@ -510,7 +518,7 @@ function plotMatrix() { verticalTopics .text(function(node) { return node.words; }); - + var circles = vis.selectAll("circle").data(correlationGraph.links); circles.enter().append("circle"); @@ -535,7 +543,7 @@ function toggleTopicDocuments(topic) { // unselect the topic d3.selectAll("div.topicwords").attr("class", "topicwords"); selectedTopic = -1; - + sortVocabByTopic = false; d3.select("#sortVocabByTopic").text("Sort by topic") } @@ -555,15 +563,15 @@ function mostFrequentWords(includeStops, sortByTopic) { // Convert the random-access map to a list of word:count pairs that // we can then sort. 
var wordCounts = []; - + if (sortByTopic) { for (var word in vocabularyCounts) { - if (wordTopicCounts[word] && + if (wordTopicCounts[word] && wordTopicCounts[word][selectedTopic]) { wordCounts.push({"word":word, "count":wordTopicCounts[word][selectedTopic]}); } - } + } } else { for (var word in vocabularyCounts) { @@ -571,7 +579,7 @@ function mostFrequentWords(includeStops, sortByTopic) { wordCounts.push({"word":word, "count":vocabularyCounts[word]}); } - } + } } wordCounts.sort(byCountDescending); @@ -593,7 +601,7 @@ function vocabTable() { var wordFrequencies = mostFrequentWords(displayingStopwords, sortVocabByTopic).slice(0, 499); var table = d3.select("#vocab-table tbody"); table.selectAll("tr").remove(); - + wordFrequencies.forEach(function (d) { var isStopword = stopwords[d.word]; var score = specificity(d.word); @@ -654,7 +662,7 @@ d3.select("#showStops").on("click", function () { } else { displayingStopwords = true; - this.innerText = "Hide stopwords"; + this.innerText = "Hide stopwords"; vocabTable(); } }); @@ -666,85 +674,90 @@ d3.select("#sortVocabByTopic").on("click", function () { } else { sortVocabByTopic = true; - this.innerText = "Sort by frequency"; + this.innerText = "Sort by frequency"; vocabTable(); } }); -// +// // Functions for download links // function saveDocTopics() { - var docTopicsCSV = ""; + // 2016-08-20 Owen Dall: Added header row for easier reading + var docTopicsCSV = "Doc," + d3.range(0, numTopics).map(function(t) {return "Topic-" + t; } ).join(",") + recordDelimiter; var topicProbabilities = zeros(numTopics); - + documents.forEach(function(d, i) { - docTopicsCSV += d.id + "," + d.topicCounts.map(function (x) { return d3.round(x / d.tokens.length, 8); }).join(",") + "\n"; - }); - + docTopicsCSV += d.id + "," + d.topicCounts.map(function (x) { return d3.round(x / d.tokens.length, 8); }).join(",") + recordDelimiter; + }); + d3.select("#doctopics-dl").attr("href", "data:Content-type:text/csv;charset=UTF-8," + 
encodeURIComponent(docTopicsCSV)); } function saveTopicWords() { - var topicWordsCSV = "word," + d3.range(0, numTopics).map(function(t) {return "topic" + t; } ).join(",") + "\n"; + // 2016-08-20 Owen Dall: Minor edits to header row + var topicWordsCSV = "Word," + d3.range(0, numTopics).map(function(t) {return "Topic-" + t; } ).join(",") + recordDelimiter; for (var word in wordTopicCounts) { var topicProbabilities = zeros(numTopics); for (var topic in wordTopicCounts[word]) { topicProbabilities[topic] = d3.round(wordTopicCounts[word][topic] / tokensPerTopic[topic], 8); } - topicWordsCSV += word + "," + topicProbabilities.join(",") + "\n"; + topicWordsCSV += word + "," + topicProbabilities.join(",") + recordDelimiter; } d3.select("#topicwords-dl").attr("href", "data:Content-type:text/csv;charset=UTF-8," + encodeURIComponent(topicWordsCSV)); } function saveTopicKeys() { - var keysCSV = "Topic,TokenCount,Words\n"; - + var keysCSV = "Topic,Token Count,Words" + recordDelimiter; // 2016-08-20 Owen Dall: Minor edit + if (topicWordCounts.length == 0) { sortTopicWords(); } for (var topic = 0; topic < numTopics; topic++) { - keysCSV += topic + "," + tokensPerTopic[topic] + ",\"" + topNWords(topicWordCounts[topic], 10) + "\"\n"; + keysCSV += topic + "," + tokensPerTopic[topic] + ",\"" + topNWords(topicWordCounts[topic], 10) + "\""+ recordDelimiter; } - + d3.select("#keys-dl").attr("href", "data:Content-type:text/csv;charset=UTF-8," + encodeURIComponent(keysCSV)); } function saveTopicPMI() { - var pmiCSV = ""; - var matrix = getTopicCorrelations(); - - matrix.forEach(function(row) { pmiCSV += row.map(function (x) { return d3.round(x, 8); }).join(",") + "\n"; }); - - d3.select("#topictopic-dl").attr("href", "data:Content-type:text/csv;charset=UTF-8," + encodeURIComponent(pmiCSV)); + // 2016-08-21 Owen Dall: Added Row and Column Labels for easy viewing. 
+ // Put column header for each topic, starting at column 2: + var pmiCSV = ","+ d3.range(0, numTopics).map(function(t) {return "Topic-" + t; } ).join(",") + recordDelimiter; + // + var matrix = getTopicCorrelations(); + // Now add the topic number as first column for each row (pass "i" into the fuction) + matrix.forEach(function(row,i) { pmiCSV += "Topic-"+ i +"," + row.map(function (x,i) { return d3.round(x, 8); }).join(",") + recordDelimiter; }); + // + d3.select("#topictopic-dl").attr("href", "data:Content-type:text/csv;charset=UTF-8," + encodeURIComponent(pmiCSV)); } function saveGraph() { - var graphCSV = "Source,Target,Weight,Type\n"; + var graphCSV = "Source,Target,Weight,Type" + recordDelimiter; var topicProbabilities = zeros(numTopics); - + documents.forEach(function(d, i) { d.topicCounts.forEach(function(x, topic) { if (x > 0.0) { graphCSV += d.id + "," + topic + "," + d3.round(x / d.tokens.length, 8) + ",undirected\n"; } }); - }); - + }); + d3.select("#graph-dl").attr("href", "data:Content-type:text/csv;charset=UTF-8," + encodeURIComponent(graphCSV)); } function saveState() { - var state = "DocID,Word,Topic"; + var state = "DocID,Word,Topic" + recordDelimiter; // 2016-08-20 Owen Dall: Need record delimiter after header row documents.forEach(function(d, docID) { d.tokens.forEach(function(token, position) { if (! 
token.isStopword) { - state += docID + ",\"" + token.word + "\"," + token.topic + "\n"; + state += docID + ",\"" + token.word + "\"," + token.topic + recordDelimiter; } }); }); - + d3.select("#state-dl").attr("href", "data:Content-type:text/csv;charset=UTF-8," + encodeURIComponent(state)); } @@ -759,10 +772,10 @@ function ready(error, stops, lines) { if (error) { alert("One of these URLs didn't work:\n " + stopwordsURL + "\n " + documentsURL); } else { // Create the stoplist - stops.split("\n").forEach(function (w) { stopwords[w] = 1; }); + stops.split(recordDelimiter).forEach(function (w) { stopwords[w] = 1; }); // Load documents and populate the vocabulary - lines.split("\n").forEach(parseLine); + lines.split(recordDelimiter).forEach(parseLine); sortTopicWords(); displayTopicWords(); @@ -775,4 +788,3 @@ function ready(error, stops, lines) { timeSeries(); } } - diff --git a/state.csv b/state.csv new file mode 100644 index 0000000..ae190ba --- /dev/null +++ b/state.csv @@ -0,0 +1,35 @@ +DocID,Word,Topic0,"money",1 +0,"bank",1 +0,"loan",1 +0,"bank",1 +0,"money",1 +0,"money",1 +0,"bank",1 +0,"loan",1 +1,"money",1 +1,"bank",1 +1,"river",0 +1,"bank",1 +1,"stream",0 +1,"money",1 +1,"loan",1 +1,"bank",0 +2,"river",0 +2,"bank",0 +2,"stream",0 +2,"river",0 +2,"bank",0 +2,"river",0 +2,"stream",0 +2,"bank",0 +3,"money",1 +3,"bank",1 +3,"bank",1 +3,"loan",1 +3,"bank",1 +3,"bank",1 +4,"river",0 +4,"bank",0 +4,"river",0 +4,"river",0 +4,"stream",0 diff --git a/topictopic.csv b/topictopic.csv new file mode 100644 index 0000000..b9318c5 --- /dev/null +++ b/topictopic.csv @@ -0,0 +1,2 @@ +0,-0.58778666 +-0.58778666,0 diff --git a/topicwords.csv b/topicwords.csv new file mode 100644 index 0000000..e33e9de --- /dev/null +++ b/topicwords.csv @@ -0,0 +1,6 @@ +word,topic0,topic1 +money,0,0.31578947 +bank,0.3125,0.47368421 +loan,0,0.21052632 +river,0.4375,0 +stream,0.25,0 From 1ddd957e8a2db9092e3e99dcef50a298fa8ae443 Mon Sep 17 00:00:00 2001 From: owendall Date: Sun, 21 Aug 2016 14:25:39 
-0400 Subject: [PATCH 3/6] Updated CHANGELOG.md --- CHANGELOG.md | 8 +++++++- bank-vs-bank.txt | 5 ----- exports/doctopics.csv | 6 ------ exports/gephi.csv | 9 --------- exports/keys.csv | 3 --- exports/state.csv | 36 ------------------------------------ exports/topictopic.csv | 2 -- exports/topicwords.csv | 6 ------ state.csv | 35 ----------------------------------- topictopic.csv | 2 -- topicwords.csv | 6 ------ 11 files changed, 7 insertions(+), 111 deletions(-) delete mode 100644 bank-vs-bank.txt delete mode 100644 exports/doctopics.csv delete mode 100644 exports/gephi.csv delete mode 100644 exports/keys.csv delete mode 100644 exports/state.csv delete mode 100644 exports/topictopic.csv delete mode 100644 exports/topicwords.csv delete mode 100644 state.csv delete mode 100644 topictopic.csv delete mode 100644 topicwords.csv diff --git a/CHANGELOG.md b/CHANGELOG.md index f646cfe..b315aa9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,12 @@ # Changelog ## 1.0.1 -2016-08-20 +2016-08-21: Owen Dall +- Added global "recordDelimiter" variable to be used throughout instead of "\n". +- Fixed minor issue with "saveState" CSV header not having a record delimiter. +- Added Row and column labels to the topicPMI csv. +- Minor edits row and column labels for the other csv definitions. + +2016-08-20: Owen Dall - Forked master branch from mimno/jsLDA in order to offer contributions to the Topic Modeling community. - Version 1.0.1 provides bug fixes as per the Semantic Versioning 2.0.0 standard http://semver.org/ diff --git a/bank-vs-bank.txt b/bank-vs-bank.txt deleted file mode 100644 index 0efec71..0000000 --- a/bank-vs-bank.txt +++ /dev/null @@ -1,5 +0,0 @@ -1 2016 the money from the bank loan. the bank money. The money for the bank loan -2 2016 a money bank. a river bank. a stream of money loan from the bank. -3 2016 the river bank. the stream. the river bank for the river. a stream bank. -4 2016 the money from the bank. bank loan. the bank was a bank. 
-5 2016 the river bank. the river was a river with a stream. diff --git a/exports/doctopics.csv b/exports/doctopics.csv deleted file mode 100644 index 2b9d0aa..0000000 --- a/exports/doctopics.csv +++ /dev/null @@ -1,6 +0,0 @@ -Doc,Topic-0,Topic-1 -0,0,0.53333333 -1,0.18181818,0.54545455 -2,0.38461538,0.23076923 -3,0,0.54545455 -4,0.44444444,0.11111111 diff --git a/exports/gephi.csv b/exports/gephi.csv deleted file mode 100644 index f3f3453..0000000 --- a/exports/gephi.csv +++ /dev/null @@ -1,9 +0,0 @@ -Source,Target,Weight,Type -0,1,0.53333333,undirected -1,0,0.18181818,undirected -1,1,0.54545455,undirected -2,0,0.38461538,undirected -2,1,0.23076923,undirected -3,1,0.54545455,undirected -4,0,0.44444444,undirected -4,1,0.11111111,undirected diff --git a/exports/keys.csv b/exports/keys.csv deleted file mode 100644 index bcd2e5a..0000000 --- a/exports/keys.csv +++ /dev/null @@ -1,3 +0,0 @@ -Topic,TokenCount,Words -0,11,"river stream money bank loan" -1,24,"bank money loan river stream" diff --git a/exports/state.csv b/exports/state.csv deleted file mode 100644 index 573395c..0000000 --- a/exports/state.csv +++ /dev/null @@ -1,36 +0,0 @@ -DocID,Word,Topic -0,"money",1 -0,"bank",1 -0,"loan",1 -0,"bank",1 -0,"money",1 -0,"money",1 -0,"bank",1 -0,"loan",1 -1,"money",1 -1,"bank",1 -1,"river",0 -1,"bank",1 -1,"stream",0 -1,"money",1 -1,"loan",1 -1,"bank",1 -2,"river",0 -2,"bank",1 -2,"stream",0 -2,"river",0 -2,"bank",1 -2,"river",0 -2,"stream",0 -2,"bank",1 -3,"money",1 -3,"bank",1 -3,"bank",1 -3,"loan",1 -3,"bank",1 -3,"bank",1 -4,"river",0 -4,"bank",1 -4,"river",0 -4,"river",0 -4,"stream",0 diff --git a/exports/topictopic.csv b/exports/topictopic.csv deleted file mode 100644 index 510f6e1..0000000 --- a/exports/topictopic.csv +++ /dev/null @@ -1,2 +0,0 @@ -0,-0.18232156 --0.18232156,0 diff --git a/exports/topicwords.csv b/exports/topicwords.csv deleted file mode 100644 index df6e5a5..0000000 --- a/exports/topicwords.csv +++ /dev/null @@ -1,6 +0,0 @@ -word,Topic-0,Topic-1 
-money,0,0.25 -bank,0,0.58333333 -loan,0,0.16666667 -river,0.63636364,0 -stream,0.36363636,0 diff --git a/state.csv b/state.csv deleted file mode 100644 index ae190ba..0000000 --- a/state.csv +++ /dev/null @@ -1,35 +0,0 @@ -DocID,Word,Topic0,"money",1 -0,"bank",1 -0,"loan",1 -0,"bank",1 -0,"money",1 -0,"money",1 -0,"bank",1 -0,"loan",1 -1,"money",1 -1,"bank",1 -1,"river",0 -1,"bank",1 -1,"stream",0 -1,"money",1 -1,"loan",1 -1,"bank",0 -2,"river",0 -2,"bank",0 -2,"stream",0 -2,"river",0 -2,"bank",0 -2,"river",0 -2,"stream",0 -2,"bank",0 -3,"money",1 -3,"bank",1 -3,"bank",1 -3,"loan",1 -3,"bank",1 -3,"bank",1 -4,"river",0 -4,"bank",0 -4,"river",0 -4,"river",0 -4,"stream",0 diff --git a/topictopic.csv b/topictopic.csv deleted file mode 100644 index b9318c5..0000000 --- a/topictopic.csv +++ /dev/null @@ -1,2 +0,0 @@ -0,-0.58778666 --0.58778666,0 diff --git a/topicwords.csv b/topicwords.csv deleted file mode 100644 index e33e9de..0000000 --- a/topicwords.csv +++ /dev/null @@ -1,6 +0,0 @@ -word,topic0,topic1 -money,0,0.31578947 -bank,0.3125,0.47368421 -loan,0,0.21052632 -river,0.4375,0 -stream,0.25,0 From b4aa380165f173cbc78fb2fbe77b11c55d8b98b0 Mon Sep 17 00:00:00 2001 From: owendall Date: Sun, 21 Aug 2016 17:45:57 -0400 Subject: [PATCH 4/6] Minor edits --- jslda.js | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/jslda.js b/jslda.js index 13a2907..adea759 100644 --- a/jslda.js +++ b/jslda.js @@ -36,6 +36,7 @@ stopwordsURL = decodeURIComponent(stopwordsURL); ***********************************************************************************/ // Unix/Mac OS X => LF => "\n"; Windows => CRLF => "\r\n" var recordDelimiter = "\n"; + // Revisit placing this option in the UI /**********************************************************************************/ @@ -461,6 +462,7 @@ function getTopicCorrelations() { } } }); + for (var t1 = 0; t1 < numTopics - 1; t1++) { for (var t2 = t1 + 1; t2 < numTopics; t2++) { correlationMatrix[t1][t2] = 
Math.log((documents.length * correlationMatrix[t1][t2]) / @@ -710,7 +712,7 @@ function saveTopicWords() { } function saveTopicKeys() { - var keysCSV = "Topic,Token Count,Words" + recordDelimiter; // 2016-08-20 Owen Dall: Minor edit + var keysCSV = "Topic,Token Count,Top 10 Words " + recordDelimiter; // 2016-08-20 Owen Dall: Minor edits if (topicWordCounts.length == 0) { sortTopicWords(); } @@ -723,6 +725,7 @@ function saveTopicKeys() { function saveTopicPMI() { // 2016-08-21 Owen Dall: Added Row and Column Labels for easy viewing. + // PMI = Pointwise Mutual Information // Put column header for each topic, starting at column 2: var pmiCSV = ","+ d3.range(0, numTopics).map(function(t) {return "Topic-" + t; } ).join(",") + recordDelimiter; // From e3eefc22262366407cf5afcc547583ad5f6ac205 Mon Sep 17 00:00:00 2001 From: owendall Date: Sun, 25 Sep 2016 17:40:15 -0400 Subject: [PATCH 5/6] Fix for document-topic probability matrix --- bank-vs-bank.txt | 10 +++ jslda.js | 15 +++-- sweeps.js | 66 ++++++++++++++++++++ us-constitution-bill-of-rights-formatted.txt | 10 +++ 4 files changed, 97 insertions(+), 4 deletions(-) create mode 100644 bank-vs-bank.txt create mode 100644 sweeps.js create mode 100644 us-constitution-bill-of-rights-formatted.txt diff --git a/bank-vs-bank.txt b/bank-vs-bank.txt new file mode 100644 index 0000000..e19aad8 --- /dev/null +++ b/bank-vs-bank.txt @@ -0,0 +1,10 @@ +I live on the river, at the river bank. +I had a boat on the river. I was on the river every day. The river is great. +A storm pulled the boat from the river bank. The boat sank in the river, far from the bank. +I needed money for a bank loan to get a new boat. I went to the bank. The bank was open. +I told the bank that I need a new boat. The bank told me they would loan me the money. +I called the bank every month about the money for the loan. I need the money. I need the loan. +I miss my boat. I live on the river bank but don't have a boat. 
+It took a year for the bank to loan me the money for the boat. The bank said I would get the money for the boat +I bought a new boat with the money from the bank. +Now I take the boat out on the river. I added a pier to the river bank for the boat. diff --git a/jslda.js b/jslda.js index adea759..521b922 100644 --- a/jslda.js +++ b/jslda.js @@ -35,9 +35,9 @@ stopwordsURL = decodeURIComponent(stopwordsURL); 2016-08-20 Owen Dall: Make record delimiter configurable ***********************************************************************************/ // Unix/Mac OS X => LF => "\n"; Windows => CRLF => "\r\n" +// Revisit placing this option in the UI var recordDelimiter = "\n"; -// Revisit placing this option in the UI /**********************************************************************************/ function zeros(n) { @@ -202,7 +202,7 @@ function addStop(word) { vocabTable(); } -function removeStop(word) { + function removeStop(word) { delete stopwords[word]; vocabularySize++; wordTopicCounts[word] = {}; @@ -691,7 +691,11 @@ function saveDocTopics() { var topicProbabilities = zeros(numTopics); documents.forEach(function(d, i) { - docTopicsCSV += d.id + "," + d.topicCounts.map(function (x) { return d3.round(x / d.tokens.length, 8); }).join(",") + recordDelimiter; + // 2016-09-25 Owen Dall: Don't use d.tokens.length as it includes stopwords not included in topicCounts + var tokenSum = d3.sum(d.topicCounts); + topicProbabilities = d.topicCounts.map(function (topicTokenCount) { return d3.round(topicTokenCount / tokenSum, 8); }); + docTopicsCSV += d.id + "," + topicProbabilities.join(",") + exportRecordDelimiter; + // docTopicsCSV += d.id + "," + d.topicCounts.map(function (x) { return d3.round(x / d.tokens.length, 8); }).join(",") + recordDelimiter; }); d3.select("#doctopics-dl").attr("href", "data:Content-type:text/csv;charset=UTF-8," + encodeURIComponent(docTopicsCSV)); @@ -762,6 +766,7 @@ function saveState() { }); d3.select("#state-dl").attr("href", 
"data:Content-type:text/csv;charset=UTF-8," + encodeURIComponent(state)); + debugger; } @@ -779,7 +784,9 @@ function ready(error, stops, lines) { // Load documents and populate the vocabulary lines.split(recordDelimiter).forEach(parseLine); - + // Temp for debuggging + saveState(); + debugger; sortTopicWords(); displayTopicWords(); toggleTopicDocuments(0); diff --git a/sweeps.js b/sweeps.js new file mode 100644 index 0000000..8b2d241 --- /dev/null +++ b/sweeps.js @@ -0,0 +1,66 @@ +function sweep() { + var startTime = Date.now(); + + var topicNormalizers = zeros(numTopics); + for (var topic = 0; topic < numTopics; topic++) { + topicNormalizers[topic] = 1.0 / (vocabularySize * topicWordSmoothing + tokensPerTopic[topic]); + } + + for (var doc = 0; doc < documents.length; doc++) { + var currentDoc = documents[doc]; + var docTopicCounts = currentDoc.topicCounts; + + for (var position = 0; position < currentDoc.tokens.length; position++) { + var token = currentDoc.tokens[position]; + if (token.isStopword) { continue; } + + tokensPerTopic[ token.topic ]--; + var currentWordTopicCounts = wordTopicCounts[ token.word ]; + currentWordTopicCounts[ token.topic ]--; + if (currentWordTopicCounts[ token.topic ] == 0) { + //delete(currentWordTopicCounts[ token.topic ]); + } + docTopicCounts[ token.topic ]--; + topicNormalizers[ token.topic ] = 1.0 / (vocabularySize * topicWordSmoothing + tokensPerTopic[ token.topic ]); + + var sum = 0.0; + for (var topic = 0; topic < numTopics; topic++) { + if (currentWordTopicCounts[ topic ]) { + topicWeights[topic] = + (documentTopicSmoothing + docTopicCounts[topic]) * + (topicWordSmoothing + currentWordTopicCounts[ topic ]) * + topicNormalizers[topic]; + } + else { + topicWeights[topic] = + (documentTopicSmoothing + docTopicCounts[topic]) * + topicWordSmoothing * + topicNormalizers[topic]; + } + sum += topicWeights[topic]; + } + + // Sample from an unnormalized discrete distribution + var sample = sum * Math.random(); + var i = 0; + sample -= 
topicWeights[i]; + while (sample > 0.0) { + i++; + sample -= topicWeights[i]; + } + token.topic = i; + + tokensPerTopic[ token.topic ]++; + if (! currentWordTopicCounts[ token.topic ]) { + currentWordTopicCounts[ token.topic ] = 1; + } + else { + currentWordTopicCounts[ token.topic ] += 1; + } + docTopicCounts[ token.topic ]++; + + topicNormalizers[ token.topic ] = 1.0 / (vocabularySize * topicWordSmoothing + tokensPerTopic[ token.topic ]); + } + } + + //console.log("sweep in " + (Date.now() - startTime) + " ms"); \ No newline at end of file diff --git a/us-constitution-bill-of-rights-formatted.txt b/us-constitution-bill-of-rights-formatted.txt new file mode 100644 index 0000000..0226cb8 --- /dev/null +++ b/us-constitution-bill-of-rights-formatted.txt @@ -0,0 +1,10 @@ +Amendment-1 1791 Congress shall make no law respecting an establishment of religion, or prohibiting the free exercise thereof; or abridging the freedom of speech, or of the press; or the right of the people peaceably to assemble, and to petition the Government for a redress of grievances. +Amendment-2 1791 A well regulated Militia, being necessary to the security of a free State, the right of the people to keep and bear Arms, shall not be infringed. +Amendment-3 1791 No Soldier shall, in time of peace be quartered in any house, without the consent of the Owner, nor in time of war, but in a manner to be prescribed by law. +Amendment-4 1791 The right of the people to be secure in their persons, houses, papers, and effects, against unreasonable searches and seizures, shall not be violated, and no Warrants shall issue, but upon probable cause, supported by Oath or affirmation, and particularly describing the place to be searched, and the persons or things to be seized. 
+Amendment-5 1791 No person shall be held to answer for a capital, or otherwise infamous crime, unless on a presentment or indictment of a Grand Jury, except in cases arising in the land or naval forces, or in the Militia, when in actual service in time of War or public danger; nor shall any person be subject for the same offense to be twice put in jeopardy of life or limb; nor shall be compelled in any criminal case to be a witness against himself, nor be deprived of life, liberty, or property, without due process of law; nor shall private property be taken for public use, without just compensation. +Amendment-6 1791 In all criminal prosecutions, the accused shall enjoy the right to a speedy and public trial, by an impartial jury of the State and district wherein the crime shall have been committed, which district shall have been previously ascertained by law, and to be informed of the nature and cause of the accusation; to be confronted with the witnesses against him; to have compulsory process for obtaining witnesses in his favor, and to have the Assistance of Counsel for his defence. +Amendment-7 1791 In Suits at common law, where the value in controversy shall exceed twenty dollars, the right of trial by jury shall be preserved, and no fact tried by a jury, shall be otherwise re-examined in any Court of the United States, than according to the rules of the common law. +Amendment-8 1791 Excessive bail shall not be required, nor excessive fines imposed, nor cruel and unusual punishments inflicted. +Amendment-9 1791 The enumeration in the Constitution, of certain rights, shall not be construed to deny or disparage others retained by the people. +Amendment-10 1791 The powers not delegated to the United States by the Constitution, nor prohibited by it to the States, are reserved to the States respectively, or to the people. 
From e54e7b0332da8d6e3e64a72576f84fce2949ce23 Mon Sep 17 00:00:00 2001 From: owendall Date: Sun, 25 Sep 2016 17:42:17 -0400 Subject: [PATCH 6/6] Remove extra files --- bank-vs-bank.txt | 10 --- sweeps.js | 66 -------------------- us-constitution-bill-of-rights-formatted.txt | 10 --- 3 files changed, 86 deletions(-) delete mode 100644 bank-vs-bank.txt delete mode 100644 sweeps.js delete mode 100644 us-constitution-bill-of-rights-formatted.txt diff --git a/bank-vs-bank.txt b/bank-vs-bank.txt deleted file mode 100644 index e19aad8..0000000 --- a/bank-vs-bank.txt +++ /dev/null @@ -1,10 +0,0 @@ -I live on the river, at the river bank. -I had a boat on the river. I was on the river every day. The river is great. -A storm pulled the boat from the river bank. The boat sank in the river, far from the bank. -I needed money for a bank loan to get a new boat. I went to the bank. The bank was open. -I told the bank that I need a new boat. The bank told me they would loan me the money. -I called the bank every month about the money for the loan. I need the money. I need the loan. -I miss my boat. I live on the river bank but don't have a boat. -It took a year for the bank to loan me the money for the boat. The bank said I would get the money for the boat -I bought a new boat with the money from the bank. -Now I take the boat out on the river. I added a pier to the river bank for the boat. 
diff --git a/sweeps.js b/sweeps.js deleted file mode 100644 index 8b2d241..0000000 --- a/sweeps.js +++ /dev/null @@ -1,66 +0,0 @@ -function sweep() { - var startTime = Date.now(); - - var topicNormalizers = zeros(numTopics); - for (var topic = 0; topic < numTopics; topic++) { - topicNormalizers[topic] = 1.0 / (vocabularySize * topicWordSmoothing + tokensPerTopic[topic]); - } - - for (var doc = 0; doc < documents.length; doc++) { - var currentDoc = documents[doc]; - var docTopicCounts = currentDoc.topicCounts; - - for (var position = 0; position < currentDoc.tokens.length; position++) { - var token = currentDoc.tokens[position]; - if (token.isStopword) { continue; } - - tokensPerTopic[ token.topic ]--; - var currentWordTopicCounts = wordTopicCounts[ token.word ]; - currentWordTopicCounts[ token.topic ]--; - if (currentWordTopicCounts[ token.topic ] == 0) { - //delete(currentWordTopicCounts[ token.topic ]); - } - docTopicCounts[ token.topic ]--; - topicNormalizers[ token.topic ] = 1.0 / (vocabularySize * topicWordSmoothing + tokensPerTopic[ token.topic ]); - - var sum = 0.0; - for (var topic = 0; topic < numTopics; topic++) { - if (currentWordTopicCounts[ topic ]) { - topicWeights[topic] = - (documentTopicSmoothing + docTopicCounts[topic]) * - (topicWordSmoothing + currentWordTopicCounts[ topic ]) * - topicNormalizers[topic]; - } - else { - topicWeights[topic] = - (documentTopicSmoothing + docTopicCounts[topic]) * - topicWordSmoothing * - topicNormalizers[topic]; - } - sum += topicWeights[topic]; - } - - // Sample from an unnormalized discrete distribution - var sample = sum * Math.random(); - var i = 0; - sample -= topicWeights[i]; - while (sample > 0.0) { - i++; - sample -= topicWeights[i]; - } - token.topic = i; - - tokensPerTopic[ token.topic ]++; - if (! 
currentWordTopicCounts[ token.topic ]) { - currentWordTopicCounts[ token.topic ] = 1; - } - else { - currentWordTopicCounts[ token.topic ] += 1; - } - docTopicCounts[ token.topic ]++; - - topicNormalizers[ token.topic ] = 1.0 / (vocabularySize * topicWordSmoothing + tokensPerTopic[ token.topic ]); - } - } - - //console.log("sweep in " + (Date.now() - startTime) + " ms"); \ No newline at end of file diff --git a/us-constitution-bill-of-rights-formatted.txt b/us-constitution-bill-of-rights-formatted.txt deleted file mode 100644 index 0226cb8..0000000 --- a/us-constitution-bill-of-rights-formatted.txt +++ /dev/null @@ -1,10 +0,0 @@ -Amendment-1 1791 Congress shall make no law respecting an establishment of religion, or prohibiting the free exercise thereof; or abridging the freedom of speech, or of the press; or the right of the people peaceably to assemble, and to petition the Government for a redress of grievances. -Amendment-2 1791 A well regulated Militia, being necessary to the security of a free State, the right of the people to keep and bear Arms, shall not be infringed. -Amendment-3 1791 No Soldier shall, in time of peace be quartered in any house, without the consent of the Owner, nor in time of war, but in a manner to be prescribed by law. -Amendment-4 1791 The right of the people to be secure in their persons, houses, papers, and effects, against unreasonable searches and seizures, shall not be violated, and no Warrants shall issue, but upon probable cause, supported by Oath or affirmation, and particularly describing the place to be searched, and the persons or things to be seized. 
-Amendment-5 1791 No person shall be held to answer for a capital, or otherwise infamous crime, unless on a presentment or indictment of a Grand Jury, except in cases arising in the land or naval forces, or in the Militia, when in actual service in time of War or public danger; nor shall any person be subject for the same offense to be twice put in jeopardy of life or limb; nor shall be compelled in any criminal case to be a witness against himself, nor be deprived of life, liberty, or property, without due process of law; nor shall private property be taken for public use, without just compensation. -Amendment-6 1791 In all criminal prosecutions, the accused shall enjoy the right to a speedy and public trial, by an impartial jury of the State and district wherein the crime shall have been committed, which district shall have been previously ascertained by law, and to be informed of the nature and cause of the accusation; to be confronted with the witnesses against him; to have compulsory process for obtaining witnesses in his favor, and to have the Assistance of Counsel for his defence. -Amendment-7 1791 In Suits at common law, where the value in controversy shall exceed twenty dollars, the right of trial by jury shall be preserved, and no fact tried by a jury, shall be otherwise re-examined in any Court of the United States, than according to the rules of the common law. -Amendment-8 1791 Excessive bail shall not be required, nor excessive fines imposed, nor cruel and unusual punishments inflicted. -Amendment-9 1791 The enumeration in the Constitution, of certain rights, shall not be construed to deny or disparage others retained by the people. -Amendment-10 1791 The powers not delegated to the United States by the Constitution, nor prohibited by it to the States, are reserved to the States respectively, or to the people.