diff --git a/@client/cluster.coffee b/@client/cluster.coffee
new file mode 100644
index 000000000..8bade9584
--- /dev/null
+++ b/@client/cluster.coffee
@@ -0,0 +1,126 @@
+
+class CosineSimilarityAccumulator
+  constructor: ->
+    @sumSq1 = 0
+    @sumSq2 = 0
+    @sumProducts = 0
+
+  increment: ( value1, value2 ) ->
+    value1 ?= 0
+    value2 ?= 0
+    @sumSq1 += (value1 * value1)
+    @sumSq2 += (value2 * value2)
+    @sumProducts += (value1 * value2)
+
+  current: ->
+    if ((0 < @sumSq1) and (0 < @sumSq2)) then @sumProducts / Math.sqrt( @sumSq1 * @sumSq2 ) else 0
+
+
+# Use cosine similarity because it is scale-invariant, but still anchored around zero.
+# Covariance is translation-invariant, not anchored. Euclidean distance is scale-sensitive.
+cosineSimilarity = ( nameToNumber1, nameToNumber2 ) ->
+  similarity = new CosineSimilarityAccumulator()
+  # For each feature that exists in nameToNumber1... increment similarity
+  for name, value1 of nameToNumber1
+    value2 = nameToNumber2[ name ] ? 0
+    similarity.increment( value1, value2 )
+  # For each feature that exists only in nameToNumber2... increment similarity
+  for name, value2 of nameToNumber2
+    if name not of nameToNumber1
+      similarity.increment( 0, value2 )
+  similarity.current()
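+
+# Worked example of the scale-invariance claimed above (feature names and values are illustrative):
+#   cosineSimilarity( {a:1, b:2}, {a:2, b:4} ) == 1.0 -- the vectors are parallel, so doubling a
+#   user's stances leaves the similarity unchanged, whereas Euclidean distance would separate them.
+#   cosineSimilarity( {a:1}, {b:1} ) == 0.0 -- no overlapping features, so no similarity.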
+
+
+# Merge two feature maps into their weighted average,
+# e.g. weightedAverage( {a:1, b:3}, 2, {a:4}, 1 ) yields { a:2, b:2 }
+weightedAverage = ( nameToNumber1, weight1, nameToNumber2, weight2 ) ->
+  nameToAverage = { }  # map[ name -> weighted average of feature ]
+  sumWeights = weight1 + weight2
+  # Guard against division by zero when both weights are zero
+  if sumWeights == 0
+    return nameToAverage
+  # For each feature that exists in nameToNumber1... set average
+  for name, value1 of nameToNumber1
+    value2 = nameToNumber2[ name ] ? 0
+    nameToAverage[ name ] = ( (value1 * weight1) + (value2 * weight2) ) / sumWeights
+  # For each feature that exists only in nameToNumber2... set average
+  for name, value2 of nameToNumber2
+    if name not of nameToNumber1
+      nameToAverage[ name ] = (value2 * weight2) / sumWeights
+  nameToAverage
+
+
+# Automatically cluster users, based on the similarity of their opinions across all proposals
+cluster = ( clusters ) ->
+  # Halt based on minimum-cluster-coverage and minimum-similarity. Enforce maximum-number-of-groups.
+  MIN_CLUSTERED_USERS_FRAC = 0.50
+  MAX_GROUP_USERS_FRAC = 0.30
+  MIN_SIMILARITY = 0.50
+  MAX_CLUSTERS = 5
+  MAX_LOOPS = Math.floor( clusters.length * MIN_CLUSTERED_USERS_FRAC )
+  # Cluster bottom-up, merging the most similar pair of users/clusters
+  numUsers = clusters.length
+  loopNum = 0
+  # Loop until users condense into a few dense clusters, leaving the remainder unclustered...
+  while true
+    ++loopNum
+    if MAX_LOOPS < loopNum
+      break
+
+    # Evaluate the halting conditions before searching for a candidate merge
+    numUsersUnclustered = clusters.filter( (c) -> (c.userIds.length == 1) ).length
+    numClusters = clusters.length - numUsersUnclustered
+    clusteredUsersFrac = ( numUsers - numUsersUnclustered ) / numUsers
+    sufficientClusteringDone = (numClusters <= MAX_CLUSTERS) and (MIN_CLUSTERED_USERS_FRAC <= clusteredUsersFrac)
+
+    # Find the most similar cluster pair
+    maxSimilarity = Number.NEGATIVE_INFINITY
+    maxSimilarityPair = null
+    # For each unique pair of cluster1 x cluster2...
+    for cluster1, index1 in clusters
+      for index2 in [ index1+1 ... clusters.length ]
+        cluster2 = clusters[ index2 ]
+        similarity = cosineSimilarity( cluster1.center, cluster2.center )
+        groupFrac = ( cluster1.userIds.length + cluster2.userIds.length ) / numUsers
+        if ( maxSimilarity < similarity ) and ( groupFrac < MAX_GROUP_USERS_FRAC )
+          maxSimilarity = similarity
+          maxSimilarityPair = [ cluster1, cluster2 ]
+
+    # Merge the most similar cluster pair
+    if not maxSimilarityPair then break  # no mergeable pair remains; every candidate exceeds the group-size cap
+    newCluster = {
+      userIds: maxSimilarityPair[0].userIds.concat( maxSimilarityPair[1].userIds )
+      center: weightedAverage( maxSimilarityPair[0].center, maxSimilarityPair[0].userIds.length, maxSimilarityPair[1].center, maxSimilarityPair[1].userIds.length )
+    }
+
+    # Replace the most similar pair with the new cluster
+    newClusters = clusters.filter( (c) -> (c != maxSimilarityPair[0]) and (c != maxSimilarityPair[1]) )
+    newClusters.push( newCluster )
+
+    # Enforce the halting conditions on the candidate merge: once enough users are clustered,
+    # discard merges between dissimilar clusters and stop
+    if sufficientClusteringDone and (maxSimilarity < MIN_SIMILARITY)
+      break
+
+    # Update clusters
+    clusters = newClusters
+
+  clusters
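+
+# Worked example of the halting constants (illustrative population size, not from real data):
+# with 20 users, at most floor(20 * 0.50) == 10 merges are attempted, a merge is only considered
+# while the combined group stays under 30% of all users (i.e. at most 5 users), and once at least
+# 50% of users sit in no more than 5 multi-user clusters, a best remaining pair with similarity
+# below 0.50 ends the loop instead of merging.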
+
+
+DedicatedWorkerGlobalScope.clusterAndMap = ( userXProposalToOpinion ) ->
+  # Put each user into its own singleton cluster
+  initialClusters = [ ]
+  for user, userProposalToOpinion of userXProposalToOpinion
+    initialClusters.push( { userIds:[user], center:userProposalToOpinion } )
+
+  # Group users
+  clusters = cluster( initialClusters )
+
+  # Collect map[ userId -> clusterId ]
+  # (the loop variable is userCluster, not cluster, so it does not clobber the cluster()
+  # function above -- CoffeeScript loop variables are not block-scoped)
+  userIdToClusterId = { }
+  for userCluster, clusterIndex in clusters
+    for userId, userIndex in userCluster.userIds
+      clusterId = if ( userCluster.userIds.length == 1 ) then 'None' else String( clusterIndex )
+      userIdToClusterId[ userId ] = clusterId
+
+  userIdToClusterId
+
diff --git a/@client/opinion_views.coffee b/@client/opinion_views.coffee
index 17af3dcb7..d25385d5e 100644
--- a/@client/opinion_views.coffee
+++ b/@client/opinion_views.coffee
@@ -86,6 +86,10 @@ window.get_opinions_for_proposal = (opinions, proposal, weights) ->
 
   (o for o in opinions when weights[o.user] > 0)
 
+
+CLUSTERS_CACHE_KEY = 'clusters'
+userIdToClusterId = null
+
 # Attributes are data about participants that can serve as additional filters.
 # We take them from legacy opinion views, and from
 # qualifying user_tags
@@ -144,9 +148,53 @@ window.get_participant_attributes = ->
         input_type: tag.self_report?.input
         continuous_value: tag.continuous_value
 
+  # Cluster results will be available in the cache, but only on later calls to get_participant_attributes()
+  userIdToClusterId = fetch( CLUSTERS_CACHE_KEY )?.userIdToClusterId
+  if userIdToClusterId
+    attributes.push
+      key: 'clusters'
+      name: 'User Clusters'
+      pass: (user, value) -> userIdToClusterId[ user ] ? 'None'
+      options: Array.from( new Set( Object.values(userIdToClusterId).concat('None') ) )
+      continuous_value: false
+  else
+    # Start clustering in a worker thread
+    initiateClustering()
+
   attributes
 
+
+clusterWorker = null
+
+initiateClustering = ( ) ->
+  # Only allow one cluster worker at a time
+  if clusterWorker
+    return
+
+  # Collect all user x proposal opinions
+  proposals = fetch('/proposals').proposals
+  userXProposalToOpinion = { }
+  for proposal, proposalIndex in proposals
+    for opinion, opinionIndex in proposal.opinions
+      userXProposalToOpinion[ opinion.user ] ?= { }
+      userXProposalToOpinion[ opinion.user ][ proposal.key ] = opinion.stance
+  parameters = { task:'cluster', userXProposalToOpinion:userXProposalToOpinion }
+  console.debug( 'initiateClustering() number of users =', Object.keys(userXProposalToOpinion).length )
+
+  # Start clustering on the worker thread
+  clusterWorker = new Worker( '/build/web_worker.js' )
+  clusterWorker.postMessage( parameters )
+  clusterWorker.onmessage = ( event ) ->  # event is a MessageEvent; the worker's payload is event.data
+    clusterWorker = null
+    if (not event) or (not event.data)
+      return
+    console.debug( 'initiateClustering() clusterWorker.onmessage() number of users =', Object.keys(event.data.userIdToClusterId).length )
+    # Update user attributes and the custom filtering display.
+    # Do not save to the server -- not implemented, and we don't want to use that much server storage per user.
+    clustersRecord = fetch( CLUSTERS_CACHE_KEY ) ? { key:CLUSTERS_CACHE_KEY }
+    clustersRecord.userIdToClusterId = event.data.userIdToClusterId
+    save( clustersRecord )
+
 
 window.get_user_groups_from_views = (groups) ->
diff --git a/@client/web_worker.coffee b/@client/web_worker.coffee
index 47775bfba..9395c8027 100644
--- a/@client/web_worker.coffee
+++ b/@client/web_worker.coffee
@@ -1,4 +1,6 @@
 require './histogram_layout'
+require './cluster'
+
 
 addEventListener 'message', (ev) ->
   msg = ev.data
@@ -6,3 +8,7 @@ addEventListener 'message', (ev) ->
 
   if msg.task == 'layoutAvatars'
     DedicatedWorkerGlobalScope.enqueue_histo_layout msg
+  else if msg.task == 'cluster'
+    userIdToClusterId = DedicatedWorkerGlobalScope.clusterAndMap( msg.userXProposalToOpinion )
+    postMessage( {result:'clustered', userIdToClusterId:userIdToClusterId} )
+
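+# Example round trip (hypothetical two-user data, for illustration):
+#   clusterWorker.postMessage { task:'cluster', userXProposalToOpinion: { alice: { pr1: 1 }, bob: { pr1: -1 } } }
+#   replies with { result:'clustered', userIdToClusterId: { alice:'None', bob:'None' } }, since a
+#   two-user population can never satisfy the 30% group-size cap enforced in cluster().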