From 21937597f14b48b88b8a67a18bd454c55ef795c1 Mon Sep 17 00:00:00 2001 From: Chad Date: Tue, 23 May 2023 14:14:49 -0700 Subject: [PATCH 1/3] Add automatic-clustering as custom-view attribute --- @client/opinion_views.coffee | 147 +++++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) diff --git a/@client/opinion_views.coffee b/@client/opinion_views.coffee index 17af3dcb7..ed587f98e 100644 --- a/@client/opinion_views.coffee +++ b/@client/opinion_views.coffee @@ -144,10 +144,157 @@ window.get_participant_attributes = -> input_type: tag.self_report?.input continuous_value: tag.continuous_value + window.ensureUsersClustered() + attributes.push + key: 'clusters' + name: 'User Clusters' + pass: (user, value) -> userIdToClusterId[ user ] ? 'None' + options: Array.from( new Set( Object.values(userIdToClusterId).concat('None') ) ) + continuous_value: false + attributes +userIdToClusterId = null + +class CosineSimilarityAccumulator + constructor: -> + @sumSq1 = 0 + @sumSq2 = 0 + @sumProducts = 0 + + increment: ( value1, value2 ) -> + value1 ?= 0 + value2 ?= 0 + @sumSq1 += (value1 * value1) + @sumSq2 += (value2 * value2) + @sumProducts += (value1 * value2) + + current: -> + if ((0 < @sumSq1) and (0 < @sumSq2)) then @sumProducts / Math.sqrt( @sumSq1 * @sumSq2 ) else 0 + + +# Use cosine-similarity because it is scale-invariant, but still anchored around zero. +# Covariance is translation-invariant, not anchored. Euclidean-distance is scale sensitive. +cosineSimilarity = ( nameToNumber1, nameToNumber2 ) -> + similarity = new CosineSimilarityAccumulator() + # For each feature that exists in nameToNumber1... increment similarity + for name,value1 of nameToNumber1 + value2 = nameToNumber2[ name ] ? 0 + similarity.increment( value1, value2 ) + # For each feature that exists only in nameToNumber2... increment similarity + for name,value2 of nameToNumber2 + if name not of nameToNumber1 + similarity.increment( 0, value2 ) + similarity.current() + + +weightedAverage = ( nameToNumber1, weight1, nameToNumber2, weight2 ) -> + nameToAverage = { } # map[ name -> weighted average of feature ] + sumWeights = weight1 + weight2 + if sumWeights == 0 + return nameToAverage + # For each feature that exists in nameToNumber1... set average + for name,value1 of nameToNumber1 + value2 = nameToNumber2[ name ] ? 0 + nameToAverage[ name ] = if (sumWeights == 0) then 0 else ( (value1 * weight1) + (value2 * weight2) ) / sumWeights + # For each feature that exists only in nameToNumber2... set average + for name,value2 of nameToNumber2 + if name not of nameToNumber1 + nameToAverage[ name ] = if (sumWeights == 0) then 0 else (value2 * weight2) / sumWeights + nameToAverage + + +window.ensureUsersClustered = ( ) -> + if userIdToClusterId + return userIdToClusterId + + # Retrieve all opinion data + proposals = fetch('/proposals').proposals + + # Collect all user x proposal opinions + userXProposalToOpinion = { } + for proposal, proposalIndex in proposals + for opinion, opinionIndex in proposal.opinions + userXProposalToOpinion[ opinion.user ] ?= { } + userXProposalToOpinion[ opinion.user ][ proposal.key ] = opinion.stance + + # Put all users into separate clusters + initialClusters = [ ] + for user, userProposalToOpinion of userXProposalToOpinion + initialClusters.push( { userIds:[user], center:userProposalToOpinion } ) + + # Automatically cluster users + clusters = window.cluster( initialClusters ) + + # Collect map[ userId -> clusterId ] + userIdToClusterId = { } + for cluster, clusterIndex in clusters + for userId, userIndex in cluster.userIds + clusterId = if ( cluster.userIds.length == 1 ) then 'None' else String( clusterIndex ) + userIdToClusterId[ userId ] = clusterId + + userIdToClusterId + + +# Automatically cluster users based on similarity of their opinions across all opinions +window.cluster = ( clusters ) -> + # Halt based on minimum-cluster-coverage and minimum-similarity. Enforce maximum-number-of-groups. + MIN_CLUSTERED_USERS_FRAC = 0.3 + MIN_SIMILARITY = 0.7 + MAX_CLUSTERS = 5 + MAX_LOOPS = 1000 + + # Cluster bottom-up, merging the most similar pair of users/clusters + numUsers = clusters.length + loopNum = 0 + # Loop until users condense to a few dense clusters, and some leftover unclustered users... + while true + ++loopNum + if MAX_LOOPS < loopNum + break + + # Enforce halting before finding candidate merge + numUsersUnclustered = clusters.filter( (c) -> (c.userIds.length == 1) ).length + numClusters = clusters.length - numUsersUnclustered + clusteredUsersFrac = ( numUsers - numUsersUnclustered ) / numUsers + suffientClusteringDone = (numClusters <= MAX_CLUSTERS) and (MIN_CLUSTERED_USERS_FRAC <= clusteredUsersFrac) + + # Find most similar cluster pair + maxSimilarity = Number.NEGATIVE_INFINITY + maxSimilarityPair = null + # For each unique pair of cluster1 x cluster2... + for cluster1, index1 in clusters + for index2 in [ index1+1 ... clusters.length ] + cluster2 = clusters[ index2 ] + similarity = cosineSimilarity( cluster1.center, cluster2.center ) + if ( maxSimilarity < similarity ) + maxSimilarity = similarity + maxSimilarityPair = [ cluster1, cluster2 ] + + # Merge most similar cluster pair + if not maxSimilarityPair then break + newCluster = { + userIds: maxSimilarityPair[0].userIds.concat( maxSimilarityPair[1].userIds ) + center: weightedAverage( maxSimilarityPair[0].center, maxSimilarityPair[0].userIds.length, maxSimilarityPair[1].center, maxSimilarityPair[1].userIds.length ) + } + + # Replace most similar pair with new cluster + newClusters = clusters.filter( (i) -> (i != maxSimilarityPair[0]) and (i != maxSimilarityPair[1]) ) + newClusters.push( newCluster ) + + # Enforce halting conditions on new candidate clusters + if suffientClusteringDone and (maxSimilarity < MIN_SIMILARITY) + break + + # Update clusters + clusters = newClusters + + clusters + + + window.get_user_groups_from_views = (groups) -> has_groups = Object.keys(groups).length > 0 From a9e3c427b17c30c17d5cbb157ca3e259b7cabcbb Mon Sep 17 00:00:00 2001 From: Chad Date: Fri, 26 May 2023 21:44:20 -0700 Subject: [PATCH 2/3] Add halting parameter MAX_GROUP_USERS_FRAC --- @client/opinion_views.coffee | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/@client/opinion_views.coffee b/@client/opinion_views.coffee index ed587f98e..ffb6006bc 100644 --- a/@client/opinion_views.coffee +++ b/@client/opinion_views.coffee @@ -241,10 +241,11 @@ window.ensureUsersClustered = ( ) -> # Automatically cluster users based on similarity of their opinions across all opinions window.cluster = ( clusters ) -> # Halt based on minimum-cluster-coverage and minimum-similarity. Enforce maximum-number-of-groups. - MIN_CLUSTERED_USERS_FRAC = 0.3 - MIN_SIMILARITY = 0.7 + MIN_CLUSTERED_USERS_FRAC = 0.50 + MAX_GROUP_USERS_FRAC = 0.30 + MIN_SIMILARITY = 0.50 MAX_CLUSTERS = 5 - MAX_LOOPS = 1000 + MAX_LOOPS = Math.floor( clusters.length * MIN_CLUSTERED_USERS_FRAC ) # Cluster bottom-up, merging the most similar pair of users/clusters numUsers = clusters.length @@ -269,7 +270,8 @@ window.cluster = ( clusters ) -> for index2 in [ index1+1 ... clusters.length ] cluster2 = clusters[ index2 ] similarity = cosineSimilarity( cluster1.center, cluster2.center ) - if ( maxSimilarity < similarity ) + groupFrac = ( cluster1.userIds.length + cluster2.userIds.length ) / numUsers + if ( maxSimilarity < similarity ) and ( groupFrac < MAX_GROUP_USERS_FRAC ) maxSimilarity = similarity maxSimilarityPair = [ cluster1, cluster2 ] From 680733413a6a8adc2ae9523a6a1d3b57ef09404e Mon Sep 17 00:00:00 2001 From: Chad Date: Mon, 29 May 2023 22:02:42 -0700 Subject: [PATCH 3/3] Move clustering to a separate worker thread --- @client/cluster.coffee | 126 +++++++++++++++++++++++++ @client/opinion_views.coffee | 177 ++++++++--------------------------- @client/web_worker.coffee | 6 ++ 3 files changed, 170 insertions(+), 139 deletions(-) create mode 100644 @client/cluster.coffee diff --git a/@client/cluster.coffee b/@client/cluster.coffee new file mode 100644 index 000000000..8bade9584 --- /dev/null +++ b/@client/cluster.coffee @@ -0,0 +1,126 @@ + +class CosineSimilarityAccumulator + constructor: -> + @sumSq1 = 0 + @sumSq2 = 0 + @sumProducts = 0 + + increment: ( value1, value2 ) -> + value1 ?= 0 + value2 ?= 0 + @sumSq1 += (value1 * value1) + @sumSq2 += (value2 * value2) + @sumProducts += (value1 * value2) + + current: -> + if ((0 < @sumSq1) and (0 < @sumSq2)) then @sumProducts / Math.sqrt( @sumSq1 * @sumSq2 ) else 0 + + +# Use cosine-similarity because it is scale-invariant, but still anchored around zero. +# Covariance is translation-invariant, not anchored. Euclidean-distance is scale sensitive. +cosineSimilarity = ( nameToNumber1, nameToNumber2 ) -> + similarity = new CosineSimilarityAccumulator() + # For each feature that exists in nameToNumber1... increment similarity + for name,value1 of nameToNumber1 + value2 = nameToNumber2[ name ] ? 0 + similarity.increment( value1, value2 ) + # For each feature that exists only in nameToNumber2... increment similarity + for name,value2 of nameToNumber2 + if name not of nameToNumber1 + similarity.increment( 0, value2 ) + similarity.current() + + +weightedAverage = ( nameToNumber1, weight1, nameToNumber2, weight2 ) -> + nameToAverage = { } # map[ name -> weighted average of feature ] + sumWeights = weight1 + weight2 + if sumWeights == 0 + return nameToAverage + # For each feature that exists in nameToNumber1... set average + for name,value1 of nameToNumber1 + value2 = nameToNumber2[ name ] ? 0 + nameToAverage[ name ] = if (sumWeights == 0) then 0 else ( (value1 * weight1) + (value2 * weight2) ) / sumWeights + # For each feature that exists only in nameToNumber2... set average + for name,value2 of nameToNumber2 + if name not of nameToNumber1 + nameToAverage[ name ] = if (sumWeights == 0) then 0 else (value2 * weight2) / sumWeights + nameToAverage + + +# Automatically cluster users based on similarity of their opinions across all opinions +cluster = ( clusters ) -> + # Halt based on minimum-cluster-coverage and minimum-similarity. Enforce maximum-number-of-groups. + MIN_CLUSTERED_USERS_FRAC = 0.50 + MAX_GROUP_USERS_FRAC = 0.30 + MIN_SIMILARITY = 0.50 + MAX_CLUSTERS = 5 + MAX_LOOPS = Math.floor( clusters.length * MIN_CLUSTERED_USERS_FRAC ) + # Cluster bottom-up, merging the most similar pair of users/clusters + numUsers = clusters.length + loopNum = 0 + # Loop until users condense to a few dense clusters, and some leftover unclustered users... + while true + ++loopNum + if MAX_LOOPS < loopNum + break + + # Enforce halting before finding candidate merge + numUsersUnclustered = clusters.filter( (c) -> (c.userIds.length == 1) ).length + numClusters = clusters.length - numUsersUnclustered + clusteredUsersFrac = ( numUsers - numUsersUnclustered ) / numUsers + suffientClusteringDone = (numClusters <= MAX_CLUSTERS) and (MIN_CLUSTERED_USERS_FRAC <= clusteredUsersFrac) + + # Find most similar cluster pair + maxSimilarity = Number.NEGATIVE_INFINITY + maxSimilarityPair = null + # For each unique pair of cluster1 x cluster2... + for cluster1, index1 in clusters + for index2 in [ index1+1 ... clusters.length ] + cluster2 = clusters[ index2 ] + similarity = cosineSimilarity( cluster1.center, cluster2.center ) + groupFrac = ( cluster1.userIds.length + cluster2.userIds.length ) / numUsers + if ( maxSimilarity < similarity ) and ( groupFrac < MAX_GROUP_USERS_FRAC ) + maxSimilarity = similarity + maxSimilarityPair = [ cluster1, cluster2 ] + + # Merge most similar cluster pair + if not maxSimilarityPair then break + newCluster = { + userIds: maxSimilarityPair[0].userIds.concat( maxSimilarityPair[1].userIds ) + center: weightedAverage( maxSimilarityPair[0].center, maxSimilarityPair[0].userIds.length, maxSimilarityPair[1].center, maxSimilarityPair[1].userIds.length ) + } + + # Replace most similar pair with new cluster + newClusters = clusters.filter( (i) -> (i != maxSimilarityPair[0]) and (i != maxSimilarityPair[1]) ) + newClusters.push( newCluster ) + + # Enforce halting conditions on new candidate clusters + if suffientClusteringDone and (maxSimilarity < MIN_SIMILARITY) + break + + # Update clusters + clusters = newClusters + + clusters + + + +DedicatedWorkerGlobalScope.clusterAndMap = ( userXProposalToOpinion ) -> + # Put all users into separate clusters + initialClusters = [ ] + for user, userProposalToOpinion of userXProposalToOpinion + initialClusters.push( { userIds:[user], center:userProposalToOpinion } ) + + # Group users + clusters = cluster( initialClusters ) + + # Collect map[ userId -> clusterId ] + userIdToClusterId = { } + for cluster, clusterIndex in clusters + for userId, userIndex in cluster.userIds + clusterId = if ( cluster.userIds.length == 1 ) then 'None' else String( clusterIndex ) + userIdToClusterId[ userId ] = clusterId + + userIdToClusterId + + diff --git a/@client/opinion_views.coffee b/@client/opinion_views.coffee index ffb6006bc..d25385d5e 100644 --- a/@client/opinion_views.coffee +++ b/@client/opinion_views.coffee @@ -86,6 +86,10 @@ window.get_opinions_for_proposal = (opinions, proposal, weights) -> (o for o in opinions when weights[o.user] > 0) + +CLUSTERS_CACHE_KEY = 'clusters' +userIdToClusterId = null + # Attributes are data about participants that can serve as additional filters. # We take them from legacy opinion views, and from # qualifying user_tags @@ -144,157 +148,52 @@ window.get_participant_attributes = -> input_type: tag.self_report?.input continuous_value: tag.continuous_value - window.ensureUsersClustered() - attributes.push - key: 'clusters' - name: 'User Clusters' - pass: (user, value) -> userIdToClusterId[ user ] ? 'None' - options: Array.from( new Set( Object.values(userIdToClusterId).concat('None') ) ) - continuous_value: false + # Cluster results will be available in cache, but only on later calls to get_participant_attributes() + userIdToClusterId = fetch( CLUSTERS_CACHE_KEY )?.userIdToClusterId + if userIdToClusterId + attributes.push + key: 'clusters' + name: 'User Clusters' + pass: (user, value) -> userIdToClusterId[ user ] ? 'None' + options: Array.from( new Set( Object.values(userIdToClusterId).concat('None') ) ) + continuous_value: false + else + # Start clustering in a worker thread + initiateClustering() attributes +clusterWorker = null -userIdToClusterId = null - -class CosineSimilarityAccumulator - constructor: -> - @sumSq1 = 0 - @sumSq2 = 0 - @sumProducts = 0 - - increment: ( value1, value2 ) -> - value1 ?= 0 - value2 ?= 0 - @sumSq1 += (value1 * value1) - @sumSq2 += (value2 * value2) - @sumProducts += (value1 * value2) - - current: -> - if ((0 < @sumSq1) and (0 < @sumSq2)) then @sumProducts / Math.sqrt( @sumSq1 * @sumSq2 ) else 0 - - -# Use cosine-similarity because it is scale-invariant, but still anchored around zero. -# Covariance is translation-invariant, not anchored. Euclidean-distance is scale sensitive. -cosineSimilarity = ( nameToNumber1, nameToNumber2 ) -> - similarity = new CosineSimilarityAccumulator() - # For each feature that exists in nameToNumber1... increment similarity - for name,value1 of nameToNumber1 - value2 = nameToNumber2[ name ] ? 0 - similarity.increment( value1, value2 ) - # For each feature that exists only in nameToNumber2... increment similarity - for name,value2 of nameToNumber2 - if name not of nameToNumber1 - similarity.increment( 0, value2 ) - similarity.current() - - -weightedAverage = ( nameToNumber1, weight1, nameToNumber2, weight2 ) -> - nameToAverage = { } # map[ name -> weighted average of feature ] - sumWeights = weight1 + weight2 - if sumWeights == 0 - return nameToAverage - # For each feature that exists in nameToNumber1... set average - for name,value1 of nameToNumber1 - value2 = nameToNumber2[ name ] ? 0 - nameToAverage[ name ] = if (sumWeights == 0) then 0 else ( (value1 * weight1) + (value2 * weight2) ) / sumWeights - # For each feature that exists only in nameToNumber2... set average - for name,value2 of nameToNumber2 - if name not of nameToNumber1 - nameToAverage[ name ] = if (sumWeights == 0) then 0 else (value2 * weight2) / sumWeights - nameToAverage - - -window.ensureUsersClustered = ( ) -> - if userIdToClusterId - return userIdToClusterId - - # Retrieve all opinion data - proposals = fetch('/proposals').proposals +initiateClustering = ( ) -> + # Only allow one cluster worker at a time + if clusterWorker + return # Collect all user x proposal opinions + proposals = fetch('/proposals').proposals userXProposalToOpinion = { } for proposal, proposalIndex in proposals for opinion, opinionIndex in proposal.opinions userXProposalToOpinion[ opinion.user ] ?= { } userXProposalToOpinion[ opinion.user ][ proposal.key ] = opinion.stance - - # Put all users into separate clusters - initialClusters = [ ] - for user, userProposalToOpinion of userXProposalToOpinion - initialClusters.push( { userIds:[user], center:userProposalToOpinion } ) - - # Automatically cluster users - clusters = window.cluster( initialClusters ) - - # Collect map[ userId -> clusterId ] - userIdToClusterId = { } - for cluster, clusterIndex in clusters - for userId, userIndex in cluster.userIds - clusterId = if ( cluster.userIds.length == 1 ) then 'None' else String( clusterIndex ) - userIdToClusterId[ userId ] = clusterId - - userIdToClusterId - - -# Automatically cluster users based on similarity of their opinions across all opinions -window.cluster = ( clusters ) -> - # Halt based on minimum-cluster-coverage and minimum-similarity. Enforce maximum-number-of-groups. - MIN_CLUSTERED_USERS_FRAC = 0.50 - MAX_GROUP_USERS_FRAC = 0.30 - MIN_SIMILARITY = 0.50 - MAX_CLUSTERS = 5 - MAX_LOOPS = Math.floor( clusters.length * MIN_CLUSTERED_USERS_FRAC ) - - # Cluster bottom-up, merging the most similar pair of users/clusters - numUsers = clusters.length - loopNum = 0 - # Loop until users condense to a few dense clusters, and some leftover unclustered users... - while true - ++loopNum - if MAX_LOOPS < loopNum - break - - # Enforce halting before finding candidate merge - numUsersUnclustered = clusters.filter( (c) -> (c.userIds.length == 1) ).length - numClusters = clusters.length - numUsersUnclustered - clusteredUsersFrac = ( numUsers - numUsersUnclustered ) / numUsers - suffientClusteringDone = (numClusters <= MAX_CLUSTERS) and (MIN_CLUSTERED_USERS_FRAC <= clusteredUsersFrac) - - # Find most similar cluster pair - maxSimilarity = Number.NEGATIVE_INFINITY - maxSimilarityPair = null - # For each unique pair of cluster1 x cluster2... - for cluster1, index1 in clusters - for index2 in [ index1+1 ... clusters.length ] - cluster2 = clusters[ index2 ] - similarity = cosineSimilarity( cluster1.center, cluster2.center ) - groupFrac = ( cluster1.userIds.length + cluster2.userIds.length ) / numUsers - if ( maxSimilarity < similarity ) and ( groupFrac < MAX_GROUP_USERS_FRAC ) - maxSimilarity = similarity - maxSimilarityPair = [ cluster1, cluster2 ] - - # Merge most similar cluster pair - if not maxSimilarityPair then break - newCluster = { - userIds: maxSimilarityPair[0].userIds.concat( maxSimilarityPair[1].userIds ) - center: weightedAverage( maxSimilarityPair[0].center, maxSimilarityPair[0].userIds.length, maxSimilarityPair[1].center, maxSimilarityPair[1].userIds.length ) - } - - # Replace most similar pair with new cluster - newClusters = clusters.filter( (i) -> (i != maxSimilarityPair[0]) and (i != maxSimilarityPair[1]) ) - newClusters.push( newCluster ) - - # Enforce halting conditions on new candidate clusters - if suffientClusteringDone and (maxSimilarity < MIN_SIMILARITY) - break - - # Update clusters - clusters = newClusters - - clusters - + parameters = { task:'cluster', userXProposalToOpinion:userXProposalToOpinion } + console.debug( 'initiateClustering() number of users =', Object.keys(userXProposalToOpinion).length ) + + # Start clustering on worker thread + clusterWorker = new Worker( '/build/web_worker.js' ) + clusterWorker.postMessage( parameters ) + clusterWorker.onmessage = ( result ) -> + clusterWorker = null + if (not result) or (not result.data) + return + console.debug( 'initiateClustering() clusterWorker.onmessage() number of users =', Object.keys(result.data.userIdToClusterId).length ) + # Update user attributes and custom filtering display + # Do not save to server -- not implemented, and dont want to use that much server storage per user + clustersRecord = fetch( CLUSTERS_CACHE_KEY ) ? { key:CLUSTERS_CACHE_KEY } + clustersRecord.userIdToClusterId = result.data.userIdToClusterId + save( clustersRecord ) diff --git a/@client/web_worker.coffee b/@client/web_worker.coffee index 47775bfba..9395c8027 100644 --- a/@client/web_worker.coffee +++ b/@client/web_worker.coffee @@ -1,4 +1,6 @@ require './histogram_layout' +require './cluster' + addEventListener 'message', (ev) -> msg = ev.data @@ -6,3 +8,7 @@ addEventListener 'message', (ev) -> if msg.task == 'layoutAvatars' DedicatedWorkerGlobalScope.enqueue_histo_layout msg + else if msg.task == 'cluster' + userIdToClusterId = DedicatedWorkerGlobalScope.clusterAndMap( msg.userXProposalToOpinion ) + postMessage( {result:'clustered', userIdToClusterId:userIdToClusterId} ) +