-
Notifications
You must be signed in to change notification settings - Fork 0
spout_aggregation_changes #1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,120 @@ | ||
| package com.twitter.summingbird.storm.collector | ||
|
|
||
| import com.twitter.algebird.Semigroup | ||
| import com.twitter.summingbird.online.executor.KeyValueShards | ||
| import com.twitter.summingbird.online.option.SummerBuilder | ||
| import backtype.storm.spout.SpoutOutputCollector | ||
| import com.twitter.algebird.util.summer.AsyncSummer | ||
| import com.twitter.util.{ Await, Future, Time } | ||
| import scala.collection.mutable.{ Map => MMap } | ||
| import scala.collection.mutable.{ MutableList => MList } | ||
| import scala.collection.{ Map => CMap } | ||
| import scala.collection.JavaConverters._ | ||
| import java.util.{ List => JList } | ||
|
|
||
| /** | ||
| * | ||
| * AggregatorOutputCollector is a wrapper around the SpoutOutputCollector. | ||
| * AsyncSummer is used to aggregate the tuples. | ||
| * Different streams have separate aggregators and caches. | ||
| * | ||
| */ | ||
| class AggregatorOutputCollector[K, V: Semigroup]( | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doc comment here. |
||
| in: SpoutOutputCollector, | ||
| summerBuilder: SummerBuilder, | ||
| summerShards: KeyValueShards) extends SpoutOutputCollector(in) { | ||
|
|
||
| // Maps each stream id to the summer that aggregates that stream's tuples. | ||
| val spoutCaches = MMap[String, AsyncSummer[(K, V), Map[K, V]]]() | ||
|
|
||
| var lastDump = Time.now.inMillis | ||
|
|
||
| // Tracks the message ids of tuples that have been aggregated but not yet emitted, keyed first by stream id and then by summer shard id: streamId -> (shardId -> message ids). | ||
| val streamMessageIdTracker = MMap[String, MMap[Int, MList[Object]]]() | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Comment here specifying the Map structure. |
||
|
|
||
/**
 * Ticks the summer of every known stream and emits whatever each tick flushes.
 *
 * KeyValueSpout.nextTuple() calls this once its tick interval has elapsed.
 */
def timerFlush() = {
  spoutCaches.foreach {
    case (streamId, summer) =>
      // Regroup the flushed entries by summer shard before handing them to emitData.
      val flushedByShard = summer.tick.map { flushed => convertToSummerInputFormat(flushed) }
      emitData(flushedByShard, streamId)
  }
}
|
|
||
/**
 * Splits a flushed cache into one sub-map per summer shard.
 *
 * @param flushedCache key/value pairs flushed out of a summer
 * @return the same pairs grouped by `summerShards.summerIdFor(key)`
 */
private def convertToSummerInputFormat(flushedCache: CMap[K, V]): CMap[Int, CMap[K, V]] =
  flushedCache.groupBy { entry => summerShards.summerIdFor(entry._1) }
|
|
||
| /* | ||
| Emits the contents of a flushed cache downstream. A flush is produced either by an add | ||
| (for example when the summer exceeds its memoryLimit) or by a timer tick. | ||
| */ | ||
/**
 * Waits for a flushed cache and emits one tuple per summer shard on `streamId`.
 *
 * Each emitted tuple is the two-element list `[shardId, aggregatedMap]`. The message ids
 * tracked for that shard (if any) are removed from the tracker and passed to `callEmit`.
 *
 * @param cache    future of the flushed entries, already grouped by summer shard id
 * @param streamId stream the tuples belong to; the empty string is what the `emit`
 *                 overloads without a stream argument pass in
 * @return the ids returned by every underlying emit call, concatenated
 */
private def emitData(cache: Future[Traversable[(Int, CMap[K, V])]], streamId: String): List[Int] = {
  // Blocks the calling thread until the summer has produced its flush result.
  val flushedTups = Await.result(cache)
  // A stream whose tuples never carried a message id has no tracker entry, so look it up
  // safely instead of with apply(), which would throw NoSuchElementException.
  val messageIdsTracker = streamMessageIdTracker.get(streamId)
  flushedTups.toList.flatMap {
    case (shardId, aggregated) =>
      val messageIds = messageIdsTracker.flatMap(_.remove(shardId))
      val tuple = List(shardId, aggregated).asJava.asInstanceOf[JList[AnyRef]]
      // Convert the Java result explicitly; JavaConverters supplies no implicit view for flatten.
      callEmit(messageIds, tuple, streamId).asScala.map(_.intValue)
  }
}
|
|
||
| /* | ||
| This is a wrapper method to call the emit with appropriate signature | ||
| based on the arguments. | ||
| */ | ||
/**
 * Picks the overload of the wrapped collector's `emit` that matches the arguments.
 *
 * An empty `stream` selects the overloads without a stream argument, and an empty
 * `messageIds` selects the overloads without a message id. When `messageIds` is defined,
 * the Option itself is passed as the message id.
 *
 * NOTE(review): a reviewer found the special handling of the empty stream name odd and
 * asked what it signifies and whether Option[String] could replace the sentinel value.
 * In this class the empty string is what the `emit` overloads that take no stream pass in.
 */
private def callEmit(messageIds: Option[Any], list: JList[AnyRef], stream: String): JList[Integer] = {
  val noStream = stream.isEmpty
  if (messageIds.isDefined) {
    if (noStream) in.emit(list, messageIds) else in.emit(stream, list, messageIds)
  } else {
    if (noStream) in.emit(list) else in.emit(stream, list)
  }
}
|
|
||
/**
 * Records the tuple's message id, when one is given, and adds the tuple to its stream's cache.
 *
 * @param tuple     key/value pair to aggregate
 * @param streamid  stream the tuple was emitted on
 * @param messageId message id supplied with the tuple, if any
 * @return whatever `addToCache` returns for this tuple
 */
private def add(tuple: (K, V), streamid: String, messageId: Option[Any] = None) = {
  messageId.foreach { id => trackMessageId(tuple, id, streamid) }
  addToCache(tuple, streamid)
}
|
|
||
/**
 * Adds the tuple to the summer of `streamid`, building that summer on first use.
 *
 * @return the result of the summer's `add` call
 */
private def addToCache(tuple: (K, V), streamid: String) = {
  // The summer is only built when the stream has no entry yet.
  val summer = spoutCaches.getOrElseUpdate(streamid, summerBuilder.getSummer[K, V](implicitly[Semigroup[V]]))
  summer.add(tuple)
}
|
|
||
/**
 * Remembers message id `o` under stream `s` and the summer shard of the tuple's key.
 *
 * The ids collected for a shard are later removed by `emitData` when that shard's
 * aggregated tuple is emitted.
 *
 * @param tuple key/value pair the message id belongs to; only the key is used
 * @param o     message id to record
 * @param s     stream the tuple was emitted on
 */
private def trackMessageId(tuple: (K, V), o: scala.Any, s: String): Unit = {
  // Compute the shard once; the original looked it up twice.
  val shardId = summerShards.summerIdFor(tuple._1)
  // getOrElseUpdate registers missing entries itself, so no `var` or re-insert is needed.
  val perShardIds = streamMessageIdTracker.getOrElseUpdate(s, MMap[Int, MList[Object]]())
  perShardIds.getOrElseUpdate(shardId, MList[Object]()) += o.asInstanceOf[Object]
}
|
|
||
/**
 * Unpacks a key/value pair from an incoming tuple, adds it to the stream's cache and
 * emits whatever that add flushes.
 *
 * @param streamId  stream the tuple was emitted on
 * @param list      incoming tuple; element 0 is read as a two-element list of key and value
 * @param messageId message id supplied with the tuple, if any
 * @return the ids returned by `emitData`, as a Java list
 */
def extractAndProcessElements(streamId: String, list: JList[AnyRef], messageId: Option[Any] = None): JList[Integer] = {
  val keyValue = list.get(0).asInstanceOf[JList[AnyRef]]
  // Unchecked casts: the element types are erased at runtime.
  val pair = (keyValue.get(0).asInstanceOf[K], keyValue.get(1).asInstanceOf[V])
  val flushedByShard = add(pair, streamId, messageId).map { flushed => convertToSummerInputFormat(flushed) }
  val emitted = emitData(flushedByShard, streamId)
  emitted.asJava.asInstanceOf[JList[Integer]]
}
|
|
||
/** Aggregates a tuple sent on stream `s` with message id `o`. */
override def emit(s: String, list: JList[AnyRef], o: scala.AnyRef): JList[Integer] = {
  extractAndProcessElements(streamId = s, list = list, messageId = Some(o))
}

/** Aggregates a tuple sent without a stream (empty stream id) with message id `o`. */
override def emit(list: JList[AnyRef], o: scala.AnyRef): JList[Integer] = {
  extractAndProcessElements(streamId = "", list = list, messageId = Some(o))
}

/** Aggregates a tuple sent without a stream (empty stream id) and without a message id. */
override def emit(list: JList[AnyRef]): JList[Integer] = {
  extractAndProcessElements(streamId = "", list = list)
}

/** Aggregates a tuple sent on stream `s` without a message id. */
override def emit(s: String, list: JList[AnyRef]): JList[Integer] = {
  extractAndProcessElements(streamId = s, list = list)
}
|
|
||
/** Forwards the error unchanged to the wrapped collector. */
override def reportError(throwable: Throwable): Unit = {
  in.reportError(throwable)
}
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,32 +4,74 @@ import backtype.storm.spout.SpoutOutputCollector | |
| import backtype.storm.task.TopologyContext | ||
| import backtype.storm.topology.{ IRichSpout, OutputFieldsDeclarer } | ||
| import backtype.storm.tuple.Fields | ||
| import com.twitter.algebird.Semigroup | ||
| import com.twitter.summingbird.online.Externalizer | ||
| import com.twitter.summingbird.online.executor.KeyValueShards | ||
| import com.twitter.summingbird.online.option.SummerBuilder | ||
| import com.twitter.summingbird.storm.Constants._ | ||
| import com.twitter.tormenta.spout.SpoutProxy | ||
| import java.util | ||
| import java.util.{ List => JList } | ||
| import com.twitter.summingbird.storm.collector.TransformingOutputCollector | ||
| import scala.collection.mutable.{ MutableList => MList } | ||
| import com.twitter.summingbird.storm.collector.{ AggregatorOutputCollector, TransformingOutputCollector } | ||
| import com.twitter.util.{ Duration, Time } | ||
|
|
||
| /** | ||
| * This is a spout used when the spout is being followed by summer. | ||
| * It uses a TransformingOutputCollector on open. | ||
| * It uses a AggregatorOutputCollector on open. | ||
| */ | ||
|
|
||
| class KeyValueSpout(in: IRichSpout) extends SpoutProxy { | ||
| class KeyValueSpout[K, V: Semigroup](val in: IRichSpout, summerBuilder: SummerBuilder, summerShards: KeyValueShards) extends SpoutProxy { | ||
|
|
||
| private final val tickFrequency = Duration.fromMilliseconds(1000) | ||
| private var adapterCollector: AggregatorOutputCollector[K, V] = _ | ||
| var lastDump = Time.now | ||
|
|
||
/** Declares the two output fields of this spout: AGG_KEY and AGG_VALUE. */
override def declareOutputFields(declarer: OutputFieldsDeclarer) = {
  val outputFields = new Fields(AGG_KEY, AGG_VALUE)
  declarer.declare(outputFields)
}
|
|
||
| /* | ||
| * The transform is the function which unwraps the Value object to get the actual fields present in it. | ||
| */ | ||
|
|
||
| override def open(conf: util.Map[_, _], | ||
| topologyContext: TopologyContext, | ||
| outputCollector: SpoutOutputCollector): Unit = { | ||
| val adapterCollector = new TransformingOutputCollector(outputCollector, _.get(0).asInstanceOf[JList[AnyRef]]) | ||
| self.open(conf, topologyContext, adapterCollector) | ||
| adapterCollector = new AggregatorOutputCollector(outputCollector, summerBuilder, summerShards) | ||
| in.open(conf, topologyContext, adapterCollector) | ||
| } | ||
|
|
||
/**
 * Flushes the aggregating collector when more than `tickFrequency` has passed since
 * `lastDump`, then asks the wrapped spout for its next tuple.
 */
override def nextTuple(): Unit = {
  val flushDue = Time.now - lastDump > tickFrequency
  if (flushDue) {
    adapterCollector.timerFlush()
    // Read the clock again so the interval is measured from the end of the flush.
    lastDump = Time.now
  }
  in.nextTuple()
}
|
|
||
/**
 * Acks every individual message id carried by `msgId`.
 *
 * `msgId` holds the message ids of the tuples that were aggregated and emitted as a
 * single tuple; `convertToList` unwraps them.
 *
 * NOTE(review): a reviewer asked who calls ack with a msgId that is actually a list of
 * message ids. Within this change, AggregatorOutputCollector emits the collected ids of
 * a batch as that tuple's message id.
 */
override def ack(msgId: Object): Unit = {
  val individualIds = convertToList(msgId)
  individualIds.foreach { id => super.ack(id) }
}
|
|
||
/**
 * Fails every individual message id carried by `msgId`.
 *
 * `msgId` holds the message ids of the tuples that were aggregated and emitted as a
 * single tuple; `convertToList` unwraps them.
 */
override def fail(msgId: Object): Unit = {
  val individualIds = convertToList(msgId)
  individualIds.foreach { id => super.fail(id) }
}
|
|
||
/**
 * Unwraps the message id of an aggregated tuple into its individual message ids.
 *
 * @param msgId message id handed to ack/fail
 * @return the wrapped list for `Some(list)`; an empty list for `None` or `null`;
 *         otherwise a one-element list holding `msgId` itself, so an id of any other
 *         shape is passed through instead of raising MatchError or ClassCastException
 */
def convertToList(msgId: Object): MList[Object] = {
  msgId match {
    // Element type is erased at runtime, hence the wildcard pattern plus cast.
    case Some(ids: MList[_]) => ids.asInstanceOf[MList[Object]]
    case None | null => MList[Object]()
    case other => MList[Object](other)
  }
}
|
|
||
/** Exposes the wrapped spout `in` as this proxy's `self`. */
override protected def self: IRichSpout = {
  in
}
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's move this to TransformingOutputCollector, etc.