Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 126 additions & 0 deletions @client/cluster.coffee
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@

# Incrementally accumulates the dot product and squared magnitudes of two
# feature vectors, so their cosine similarity can be read out at any point
# without storing the vectors themselves.
class CosineSimilarityAccumulator

    constructor: ->
        @sumSq1 = 0
        @sumSq2 = 0
        @sumProducts = 0

    # Fold one coordinate pair into the running sums. Null/undefined count as 0.
    increment: ( value1, value2 ) ->
        v1 = value1 ? 0
        v2 = value2 ? 0
        @sumSq1 += v1 * v1
        @sumSq2 += v2 * v2
        @sumProducts += v1 * v2

    # Cosine similarity of everything accumulated so far.
    # Defined as 0 when either vector has zero magnitude, avoiding a divide-by-zero.
    current: ->
        return 0 unless (@sumSq1 > 0) and (@sumSq2 > 0)
        @sumProducts / Math.sqrt( @sumSq1 * @sumSq2 )


# Cosine similarity of two sparse feature vectors, each given as a map of
# feature-name -> number (missing features are treated as 0).
# Cosine similarity is used because it is scale-invariant, but still anchored
# around zero. Covariance is translation-invariant, not anchored.
# Euclidean-distance is scale sensitive.
cosineSimilarity = ( nameToNumber1, nameToNumber2 ) ->
    accumulator = new CosineSimilarityAccumulator()
    # Every feature present in the first vector, paired with its (possibly absent) counterpart
    for name, value1 of nameToNumber1
        accumulator.increment( value1, nameToNumber2[ name ] ? 0 )
    # Features present only in the second vector
    for name, value2 of nameToNumber2
        accumulator.increment( 0, value2 ) unless name of nameToNumber1
    accumulator.current()


# Weighted element-wise average of two sparse feature vectors (maps of
# feature-name -> number). Missing features are treated as 0.
# Returns an empty map when both weights are 0 -- there is nothing to
# average, and this avoids a divide-by-zero.
weightedAverage = ( nameToNumber1, weight1, nameToNumber2, weight2 ) ->
    nameToAverage = { }   # map[ name -> weighted average of feature ]
    sumWeights = weight1 + weight2
    return nameToAverage if sumWeights == 0
    # For each feature that exists in nameToNumber1... set average
    # (the early return above makes any further sumWeights == 0 check dead code)
    for name, value1 of nameToNumber1
        value2 = nameToNumber2[ name ] ? 0
        nameToAverage[ name ] = ( (value1 * weight1) + (value2 * weight2) ) / sumWeights
    # For each feature that exists only in nameToNumber2... set average
    for name, value2 of nameToNumber2
        if name not of nameToNumber1
            nameToAverage[ name ] = (value2 * weight2) / sumWeights
    nameToAverage


# Automatically cluster users based on similarity of their opinions across all opinions.
# `clusters` is an array of { userIds:[...], center:map[ feature -> number ] },
# initially one singleton cluster per user. Bottom-up (agglomerative): repeatedly
# merges the most similar pair of clusters, then returns the resulting array of
# merged groups plus any leftover singleton users.
cluster = ( clusters ) ->
    # Halt based on minimum-cluster-coverage and minimum-similarity. Enforce maximum-number-of-groups.
    MIN_CLUSTERED_USERS_FRAC = 0.50   # enough users are grouped once this fraction is in multi-user clusters
    MAX_GROUP_USERS_FRAC = 0.30       # never let one merged group hold this fraction of all users
    MIN_SIMILARITY = 0.50             # once clustering is sufficient, stop when the best merge falls below this
    MAX_CLUSTERS = 5
    # Each iteration merges 2 clusters into 1, so this bounds total merges and guarantees termination
    MAX_LOOPS = Math.floor( clusters.length * MIN_CLUSTERED_USERS_FRAC )
    # Cluster bottom-up, merging the most similar pair of users/clusters
    numUsers = clusters.length
    loopNum = 0
    # Loop until users condense to a few dense clusters, and some leftover unclustered users...
    while true
        ++loopNum
        if MAX_LOOPS < loopNum
            break

        # Enforce halting before finding candidate merge.
        # Sufficiency is judged on the clusters as they stand BEFORE this iteration's merge.
        numUsersUnclustered = clusters.filter( (c) -> (c.userIds.length == 1) ).length
        numClusters = clusters.length - numUsersUnclustered
        clusteredUsersFrac = ( numUsers - numUsersUnclustered ) / numUsers
        sufficientClusteringDone = (numClusters <= MAX_CLUSTERS) and (MIN_CLUSTERED_USERS_FRAC <= clusteredUsersFrac)

        # Find most similar cluster pair whose merged size stays under the group cap
        maxSimilarity = Number.NEGATIVE_INFINITY
        maxSimilarityPair = null
        # For each unique pair of cluster1 x cluster2...
        for cluster1, index1 in clusters
            for index2 in [ index1+1 ... clusters.length ]
                cluster2 = clusters[ index2 ]
                similarity = cosineSimilarity( cluster1.center, cluster2.center )
                groupFrac = ( cluster1.userIds.length + cluster2.userIds.length ) / numUsers
                if ( maxSimilarity < similarity ) and ( groupFrac < MAX_GROUP_USERS_FRAC )
                    maxSimilarity = similarity
                    maxSimilarityPair = [ cluster1, cluster2 ]

        # No mergeable pair left (every merge would exceed the group cap) -> done
        if not maxSimilarityPair then break

        # Merge most similar cluster pair; new center is the size-weighted average of the old centers
        newCluster = {
            userIds: maxSimilarityPair[0].userIds.concat( maxSimilarityPair[1].userIds )
            center: weightedAverage( maxSimilarityPair[0].center, maxSimilarityPair[0].userIds.length, maxSimilarityPair[1].center, maxSimilarityPair[1].userIds.length )
        }

        # Replace most similar pair with new cluster
        newClusters = clusters.filter( (c) -> (c != maxSimilarityPair[0]) and (c != maxSimilarityPair[1]) )
        newClusters.push( newCluster )

        # Enforce halting conditions: once clustering is sufficient AND the best
        # available merge is no longer similar enough, stop WITHOUT applying it
        if sufficientClusteringDone and (maxSimilarity < MIN_SIMILARITY)
            break

        # Update clusters
        clusters = newClusters

    clusters



# Worker entry point: cluster users by their opinions and return map[ userId -> clusterId ].
# `userXProposalToOpinion` is map[ userId -> map[ proposalKey -> stance ] ].
# Users left in singleton clusters get the clusterId 'None'.
DedicatedWorkerGlobalScope.clusterAndMap = ( userXProposalToOpinion ) ->
    # Put all users into separate clusters, each centered on that user's own opinions
    initialClusters = [ ]
    for user, userProposalToOpinion of userXProposalToOpinion
        initialClusters.push( { userIds:[user], center:userProposalToOpinion } )

    # Group users
    clusters = cluster( initialClusters )

    # Collect map[ userId -> clusterId ].
    # BUG FIX: the loop variable must NOT be named `cluster` -- CoffeeScript has no
    # block scope, so that assignment clobbered the file-level `cluster` function,
    # making any second call to clusterAndMap crash on a non-function.
    userIdToClusterId = { }
    for userCluster, clusterIndex in clusters
        # clusterId is the same for every user in the cluster, so compute it once
        clusterId = if ( userCluster.userIds.length == 1 ) then 'None' else String( clusterIndex )
        for userId in userCluster.userIds
            userIdToClusterId[ userId ] = clusterId

    userIdToClusterId


48 changes: 48 additions & 0 deletions @client/opinion_views.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,10 @@ window.get_opinions_for_proposal = (opinions, proposal, weights) ->
(o for o in opinions when weights[o.user] > 0)



CLUSTERS_CACHE_KEY = 'clusters'
userIdToClusterId = null

# Attributes are data about participants that can serve as additional filters.
# We take them from legacy opinion views, and from
# qualifying user_tags
Expand Down Expand Up @@ -144,9 +148,53 @@ window.get_participant_attributes = ->
input_type: tag.self_report?.input
continuous_value: tag.continuous_value

# Cluster results will be available in cache, but only on later calls to get_participant_attributes()
userIdToClusterId = fetch( CLUSTERS_CACHE_KEY )?.userIdToClusterId
if userIdToClusterId
attributes.push
key: 'clusters'
name: 'User Clusters'
pass: (user, value) -> userIdToClusterId[ user ] ? 'None'
options: Array.from( new Set( Object.values(userIdToClusterId).concat('None') ) )
continuous_value: false
else
# Start clustering in a worker thread
initiateClustering()

attributes


clusterWorker = null

# Start clustering users by opinion on a worker thread.
# When the worker replies, the resulting map[ userId -> clusterId ] is cached
# under CLUSTERS_CACHE_KEY so a later call to get_participant_attributes() can
# pick it up. Does nothing if a cluster worker is already running.
initiateClustering = ( ) ->
    # Only allow one cluster worker at a time
    if clusterWorker
        return

    # Collect all user x proposal opinions: map[ userId -> map[ proposalKey -> stance ] ]
    proposals = fetch('/proposals').proposals
    userXProposalToOpinion = { }
    for proposal in proposals
        for opinion in proposal.opinions
            userXProposalToOpinion[ opinion.user ] ?= { }
            userXProposalToOpinion[ opinion.user ][ proposal.key ] = opinion.stance
    parameters = { task:'cluster', userXProposalToOpinion:userXProposalToOpinion }
    console.debug( 'initiateClustering() number of users =', Object.keys(userXProposalToOpinion).length )

    # Start clustering on worker thread
    clusterWorker = new Worker( '/build/web_worker.js' )
    clusterWorker.postMessage( parameters )
    clusterWorker.onerror = ( error ) ->
        # BUG FIX: without this, a crashed worker left clusterWorker set forever,
        # permanently blocking any retry of clustering
        clusterWorker = null
        console.error( 'initiateClustering() worker failed:', error )
    clusterWorker.onmessage = ( result ) ->
        clusterWorker = null
        if (not result) or (not result.data)
            return
        console.debug( 'initiateClustering() clusterWorker.onmessage() number of users =', Object.keys(result.data.userIdToClusterId).length )
        # Update user attributes and custom filtering display
        # Do not save to server -- not implemented, and dont want to use that much server storage per user
        clustersRecord = fetch( CLUSTERS_CACHE_KEY ) ? { key:CLUSTERS_CACHE_KEY }
        clustersRecord.userIdToClusterId = result.data.userIdToClusterId
        save( clustersRecord )



window.get_user_groups_from_views = (groups) ->
Expand Down
6 changes: 6 additions & 0 deletions @client/web_worker.coffee
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
require './histogram_layout'
require './cluster'


# Dispatch incoming worker messages by task name.
addEventListener 'message', (ev) ->
    msg = ev.data
    switch msg.task
        when 'layoutAvatars'
            DedicatedWorkerGlobalScope.enqueue_histo_layout msg
        when 'cluster'
            # Cluster users by opinion, then reply with map[ userId -> clusterId ]
            userIdToClusterId = DedicatedWorkerGlobalScope.clusterAndMap( msg.userXProposalToOpinion )
            postMessage( {result:'clustered', userIdToClusterId:userIdToClusterId} )