Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 126 additions & 0 deletions @client/cluster.coffee
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@

# Incrementally accumulates the dot product and squared magnitudes of two
# feature vectors, so their cosine similarity can be read out at any point
# without storing the vectors themselves.
class CosineSimilarityAccumulator

    constructor: ->
        @sumSq1 = 0
        @sumSq2 = 0
        @sumProducts = 0

    # Fold one coordinate pair into the running sums. Null/undefined count as 0.
    increment: ( value1, value2 ) ->
        v1 = value1 ? 0
        v2 = value2 ? 0
        @sumSq1 += v1 * v1
        @sumSq2 += v2 * v2
        @sumProducts += v1 * v2

    # Cosine similarity of everything accumulated so far.
    # Defined as 0 when either vector has zero magnitude, avoiding a divide-by-zero.
    current: ->
        return 0 unless (@sumSq1 > 0) and (@sumSq2 > 0)
        @sumProducts / Math.sqrt( @sumSq1 * @sumSq2 )


# Cosine similarity of two sparse feature vectors, each given as a map of
# feature-name -> number (missing features are treated as 0).
# Cosine similarity is used because it is scale-invariant, but still anchored
# around zero. Covariance is translation-invariant, not anchored.
# Euclidean-distance is scale sensitive.
cosineSimilarity = ( nameToNumber1, nameToNumber2 ) ->
    accumulator = new CosineSimilarityAccumulator()
    # Every feature present in the first vector, paired with its (possibly absent) counterpart
    for name, value1 of nameToNumber1
        accumulator.increment( value1, nameToNumber2[ name ] ? 0 )
    # Features present only in the second vector
    for name, value2 of nameToNumber2
        accumulator.increment( 0, value2 ) unless name of nameToNumber1
    accumulator.current()


# Weighted element-wise average of two sparse feature vectors (maps of
# feature-name -> number). Missing features are treated as 0.
# Returns an empty map when both weights are 0 -- there is nothing to
# average, and this avoids a divide-by-zero.
weightedAverage = ( nameToNumber1, weight1, nameToNumber2, weight2 ) ->
    nameToAverage = { }   # map[ name -> weighted average of feature ]
    sumWeights = weight1 + weight2
    return nameToAverage if sumWeights == 0
    # For each feature that exists in nameToNumber1... set average
    # (the early return above makes any further sumWeights == 0 check dead code)
    for name, value1 of nameToNumber1
        value2 = nameToNumber2[ name ] ? 0
        nameToAverage[ name ] = ( (value1 * weight1) + (value2 * weight2) ) / sumWeights
    # For each feature that exists only in nameToNumber2... set average
    for name, value2 of nameToNumber2
        if name not of nameToNumber1
            nameToAverage[ name ] = (value2 * weight2) / sumWeights
    nameToAverage


# Automatically cluster users based on similarity of their opinions across all opinions.
# `clusters` is an array of { userIds:[...], center:map[ feature -> number ] },
# initially one singleton cluster per user. Bottom-up (agglomerative): repeatedly
# merges the most similar pair of clusters, then returns the resulting array of
# merged groups plus any leftover singleton users.
cluster = ( clusters ) ->
    # Halt based on minimum-cluster-coverage and minimum-similarity. Enforce maximum-number-of-groups.
    MIN_CLUSTERED_USERS_FRAC = 0.50   # enough users are grouped once this fraction is in multi-user clusters
    MAX_GROUP_USERS_FRAC = 0.30       # never let one merged group hold this fraction of all users
    MIN_SIMILARITY = 0.50             # once clustering is sufficient, stop when the best merge falls below this
    MAX_CLUSTERS = 5
    # Each iteration merges 2 clusters into 1, so this bounds total merges and guarantees termination
    MAX_LOOPS = Math.floor( clusters.length * MIN_CLUSTERED_USERS_FRAC )
    # Cluster bottom-up, merging the most similar pair of users/clusters
    numUsers = clusters.length
    loopNum = 0
    # Loop until users condense to a few dense clusters, and some leftover unclustered users...
    while true
        ++loopNum
        if MAX_LOOPS < loopNum
            break

        # Enforce halting before finding candidate merge.
        # Sufficiency is judged on the clusters as they stand BEFORE this iteration's merge.
        numUsersUnclustered = clusters.filter( (c) -> (c.userIds.length == 1) ).length
        numClusters = clusters.length - numUsersUnclustered
        clusteredUsersFrac = ( numUsers - numUsersUnclustered ) / numUsers
        sufficientClusteringDone = (numClusters <= MAX_CLUSTERS) and (MIN_CLUSTERED_USERS_FRAC <= clusteredUsersFrac)

        # Find most similar cluster pair whose merged size stays under the group cap
        maxSimilarity = Number.NEGATIVE_INFINITY
        maxSimilarityPair = null
        # For each unique pair of cluster1 x cluster2...
        for cluster1, index1 in clusters
            for index2 in [ index1+1 ... clusters.length ]
                cluster2 = clusters[ index2 ]
                similarity = cosineSimilarity( cluster1.center, cluster2.center )
                groupFrac = ( cluster1.userIds.length + cluster2.userIds.length ) / numUsers
                if ( maxSimilarity < similarity ) and ( groupFrac < MAX_GROUP_USERS_FRAC )
                    maxSimilarity = similarity
                    maxSimilarityPair = [ cluster1, cluster2 ]

        # No mergeable pair left (every merge would exceed the group cap) -> done
        if not maxSimilarityPair then break

        # Merge most similar cluster pair; new center is the size-weighted average of the old centers
        newCluster = {
            userIds: maxSimilarityPair[0].userIds.concat( maxSimilarityPair[1].userIds )
            center: weightedAverage( maxSimilarityPair[0].center, maxSimilarityPair[0].userIds.length, maxSimilarityPair[1].center, maxSimilarityPair[1].userIds.length )
        }

        # Replace most similar pair with new cluster
        newClusters = clusters.filter( (c) -> (c != maxSimilarityPair[0]) and (c != maxSimilarityPair[1]) )
        newClusters.push( newCluster )

        # Enforce halting conditions: once clustering is sufficient AND the best
        # available merge is no longer similar enough, stop WITHOUT applying it
        if sufficientClusteringDone and (maxSimilarity < MIN_SIMILARITY)
            break

        # Update clusters
        clusters = newClusters

    clusters



# Worker entry point: cluster users by their opinions and return map[ userId -> clusterId ].
# `userXProposalToOpinion` is map[ userId -> map[ proposalKey -> stance ] ].
# Users left in singleton clusters get the clusterId 'None'.
DedicatedWorkerGlobalScope.clusterAndMap = ( userXProposalToOpinion ) ->
    # Put all users into separate clusters, each centered on that user's own opinions
    initialClusters = [ ]
    for user, userProposalToOpinion of userXProposalToOpinion
        initialClusters.push( { userIds:[user], center:userProposalToOpinion } )

    # Group users
    clusters = cluster( initialClusters )

    # Collect map[ userId -> clusterId ].
    # BUG FIX: the loop variable must NOT be named `cluster` -- CoffeeScript has no
    # block scope, so that assignment clobbered the file-level `cluster` function,
    # making any second call to clusterAndMap crash on a non-function.
    userIdToClusterId = { }
    for userCluster, clusterIndex in clusters
        # clusterId is the same for every user in the cluster, so compute it once
        clusterId = if ( userCluster.userIds.length == 1 ) then 'None' else String( clusterIndex )
        for userId in userCluster.userIds
            userIdToClusterId[ userId ] = clusterId

    userIdToClusterId


48 changes: 48 additions & 0 deletions @client/opinion_views.coffee
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,10 @@ window.get_opinions_for_proposal = (opinions, proposal, weights) ->
(o for o in opinions when weights[o.user] > 0)



CLUSTERS_CACHE_KEY = 'clusters'
userIdToClusterId = null

# Attributes are data about participants that can serve as additional filters.
# We take them from legacy opinion views, and from
# qualifying user_tags
Expand Down Expand Up @@ -144,9 +148,53 @@ window.get_participant_attributes = ->
input_type: tag.self_report?.input
continuous_value: tag.continuous_value

# Cluster results will be available in cache, but only on later calls to get_participant_attributes()
userIdToClusterId = fetch( CLUSTERS_CACHE_KEY )?.userIdToClusterId
if userIdToClusterId
attributes.push
key: 'clusters'
name: 'User Clusters'
pass: (user, value) -> userIdToClusterId[ user ] ? 'None'
options: Array.from( new Set( Object.values(userIdToClusterId).concat('None') ) )
continuous_value: false
else
# Start clustering in a worker thread
initiateClustering()

attributes


clusterWorker = null

# Start clustering users by opinion on a worker thread.
# When the worker replies, the resulting map[ userId -> clusterId ] is cached
# under CLUSTERS_CACHE_KEY so a later call to get_participant_attributes() can
# pick it up. Does nothing if a cluster worker is already running.
initiateClustering = ( ) ->
    # Only allow one cluster worker at a time
    if clusterWorker
        return

    # Collect all user x proposal opinions: map[ userId -> map[ proposalKey -> stance ] ]
    proposals = fetch('/proposals').proposals
    userXProposalToOpinion = { }
    for proposal in proposals
        for opinion in proposal.opinions
            userXProposalToOpinion[ opinion.user ] ?= { }
            userXProposalToOpinion[ opinion.user ][ proposal.key ] = opinion.stance
    parameters = { task:'cluster', userXProposalToOpinion:userXProposalToOpinion }
    console.debug( 'initiateClustering() number of users =', Object.keys(userXProposalToOpinion).length )

    # Start clustering on worker thread
    clusterWorker = new Worker( '/build/web_worker.js' )
    clusterWorker.postMessage( parameters )
    clusterWorker.onerror = ( error ) ->
        # BUG FIX: without this, a crashed worker left clusterWorker set forever,
        # permanently blocking any retry of clustering
        clusterWorker = null
        console.error( 'initiateClustering() worker failed:', error )
    clusterWorker.onmessage = ( result ) ->
        clusterWorker = null
        if (not result) or (not result.data)
            return
        console.debug( 'initiateClustering() clusterWorker.onmessage() number of users =', Object.keys(result.data.userIdToClusterId).length )
        # Update user attributes and custom filtering display
        # Do not save to server -- not implemented, and dont want to use that much server storage per user
        clustersRecord = fetch( CLUSTERS_CACHE_KEY ) ? { key:CLUSTERS_CACHE_KEY }
        clustersRecord.userIdToClusterId = result.data.userIdToClusterId
        save( clustersRecord )



window.get_user_groups_from_views = (groups) ->
Expand Down
6 changes: 6 additions & 0 deletions @client/web_worker.coffee
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
require './histogram_layout'
require './cluster'


# Dispatch incoming worker messages by task name.
addEventListener 'message', (ev) ->
    msg = ev.data
    switch msg.task
        when 'layoutAvatars'
            DedicatedWorkerGlobalScope.enqueue_histo_layout msg
        when 'cluster'
            # Cluster users by opinion, then reply with map[ userId -> clusterId ]
            userIdToClusterId = DedicatedWorkerGlobalScope.clusterAndMap( msg.userXProposalToOpinion )
            postMessage( {result:'clustered', userIdToClusterId:userIdToClusterId} )