Akka streams vs spark structured streaming

val inputStream = s.readStream
.format("kinesis")
.option("streamName", "stream-name") // Kinesis Stream Name
.option("region", "ap-northeast-1") // Kinesis Region
.option("initialPosition", "TRIM_HORIZON") // Kinesis initial position
.load()
case class SampleSchema(column1: String, column2: Int)
val testSchema = ScalaReflection.schemaFor[SampleSchema].dataType.asInstanceOf[StructType]
val structuredStream = inputStream
.select(from_json(col("data").cast("string"), testSchema).as("data")) // Parse JSON and convert to testSchema
.select("data.*") // select JSON columns

val kclSource = KCLSource("scalamatsuri")
val producer = Flow[UserLog]
.map(_.toJson.compactPrint)
.map(ByteString.fromString(_))
.map(KinesisStreamObject(UUID.randomUUID().toString, _))
.toMat(KinesisProducer("scalamatsuri2", KinesisProducerSetting.fromShardCount(1)))(Keep.right)
val extract = Flow.fromFunction[ByteString, PurchaseLog](_.utf8String.parseJson.convertTo[PurchaseLog])
val convert = Flow[PurchaseLog].mapConcat { log =>
log.items.map { item =>
UserLog(
log.userId,
// properties...
)
}
}
val result =
kclSource
.via(extract)
.via(convert)
.toMat(producer)(Keep.right)
.run()

val stream = inputStream
.select(from_json(col("data").cast("string"), purchaseLogSchema).as("data"))
.select("data.*")
.select(
$"user_id",
$"time",
expr("explode(items)").as("item")
)
.select(
$"user_id",
$"time",
lit("PURCHASE").as("log_type"),
$"item.item_id".as("item_id"),
$"item.price".as("price"),
$"item.quantity".as("quantity"),
unix_timestamp().as("created_time")
)
.as[UserLog] // case class UserLog(
stream
.writeStream
.foreach(sink)
.outputMode("update")
.start()

val http = Http()
val separator = ByteString.fromString(Properties.lineSeparator)
val separatorBytes = separator.length
val MAX_FILE_SIZE = 10 * 1024 * 1024
val MAX_BYTES_PER_HOUR = 100 * 1024 * 1024
val kclSource = KCLSource("scalamatsuri")
val buffering = Flow[ByteString]
.batchWeighted[ByteString](MAX_FILE_SIZE, _.length, seed => seed)(_ ++ separator ++ _)
.throttle(MAX_BYTES_PER_HOUR, Duration(1, HOURS), _.length)
val send: Flow[ByteString, ByteString] = ??? //
val logging = Sink.foreach[ByteString] { body =>
logger.info(body.utf8String)
}
val request = kclSource
.via(buffering)
.via(send)
.toMat(logging)(Keep.right)
.run()

val inputStream = spark.readStream
.format("kinesis")
.option("streamName", scalamatsuriStraem)
.option("region", "ap-northeast-1")
.option("initialPosition", "TRIM_HORIZON")
.option("awsAccessKey", awsAccessKeyId)
.option("awsSecretKey", awsSecretKey)
.option("maxRecordsPerFetch", 100000)
.option("fetchBufferSize", 100000)
.option("maxFetchDuration", 1.0)
.load()

val strteamWithWM = stream1.as("in1")
.withColumn("dt", to_timestamp(from_unixtime(col("time"))))
.withWatermark("dt", "3 hours")
val stream2WithWM = stream2.as("in2")
.withColumn("dt", to_timestamp(from_unixtime(col("time"))))
.withWatermark("dt", "1 hours")
strteamWithWM.join(
stream2WithWM,
expr("""
in1.user_id = in2.user_id
AND
in1.time < in2.time
AND
in1.time >= in2.time + interval 3 hour
"""),
"leftOuter"
)

Akka streams vs spark structured streaming

Akka streams vs spark structured streaming

Recommended

Recommended

More Related Content

What's hot

What's hot (20)

Similar to Akka streams vs spark structured streaming

Similar to Akka streams vs spark structured streaming (20)

Recently uploaded

Recently uploaded (20)

Akka streams vs spark structured streaming