1.实时热门商品统计
需求:每隔 5 分钟输出最近一小时内点击量最多的前 N 个商品
package userbehavior35
//需求 每隔五分钟输出最近一小时内点击量最多的前 N 个商品。
/*
抽取出业务时间戳,告诉 Flink 框架基于业务时间做窗口
过滤出点击行为数据
按一小时的窗口大小,每 5 分钟统计一次,做滑动窗口聚合(Sliding Window)
按每个窗口聚合,输出每个窗口中点击量前 N 名的商品
*/
import java.lang
import java.sql.Timestamp
import java.time.Duration
import org.apache.flink.api.common.RuntimeExecutionMode
import org.apache.flink.api.common.eventtime.{SerializableTimestampAssigner, WatermarkStrategy}
import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.state.{ListState, ListStateDescriptor, ValueState, ValueStateDescriptor}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
import scala.collection.mutable.ListBuffer
case class UserBehavior(userId:Long,itemId:Long,categoryId:Long,behavior:String,timestamp:Long)
object HotItem {
def main(args: Array[String]): Unit = {
val env = StreamExecutionEnvironment.getExecutionEnvironment
env.setRuntimeMode(RuntimeExecutionMode.AUTOMATIC)
env.setParallelism(1)
val source = env.readTextFile("D:\\study\\Code\\UserBehavior\\src\\main\\resources\\UserBehavior.csv")
val dataStream: DataStream[UserBehavior] = source.map(line => {
val split = line.split(",")
val userId = split(0).trim.toLong
val itemId = split(1).trim.toLong
val categoryId = split(2).trim.toLong
val behavior = split(3).trim
val timestamp = split(4).trim.toLong
UserBehavior(userId, itemId, categoryId, behavior, timestamp)
})
.assignTimestampsAndWatermarks(WatermarkStrategy.forBoundedOutOfOrderness(Duration.ofSeconds(2))
.withTimestampAssigner(new SerializableTimestampAssigner[UserBehavior] {
override def extractTimestamp(element: UserBehavior, recordTimestamp: Long): Long = element.timestamp * 1000L
}))
val aggDataStream = dataStream.filter(_.behavior == "pv")
.keyBy(_.itemId)
.timeWindow(Time.hours(1),Time.minutes(5))
.aggregate(new CountAgg(),new WindowResultFunction())
/*val aggDataStream = dataStream.filter(_.behavior == "pv")
.keyBy(_.itemId)
.timeWindow(Time.hours(1), Time.minutes(5))
.aggregate(new CountAgg(), new WindowResultFunction())*/
//对窗口内的数据 排序输出
aggDataStream.keyBy(_.windowEnd)
.process(new TopNHotItems(3))
.print()
env.execute("job")
}
}
class CountAgg() extends AggregateFunction[UserBehavior,Long,Long]{
//初始化
override def createAccumulator(): Long = 0L
//累加
override def add(in: UserBehavior, acc: Long): Long = acc+1
//取值
override def getResult(acc: Long): Long = acc
//多个分区的合并
override def merge(acc: Long, acc1: Long): Long = acc + acc1
}
case class ItemViewCount(itemId:Long,windowEnd:Long,count:Long)
class WindowResultFunction() extends WindowFunction[Long,ItemViewCount,Long,TimeWindow]{
override def apply(key: Long, window: TimeWindow, input: Iterable[Long], out: Collector[ItemViewCount]): Unit = {
val itemId=key
val windowEnd=window.getEnd
val count=input.iterator.next()
out.collect(ItemViewCount(itemId,windowEnd,count))
}
}
class TopNHotItems(size:Int) extends KeyedProcessFunction[Long,ItemViewCount,String]{
//定义状态 保存窗口内的所有数据
var listState: ListState[ItemViewCount] = _
var timerState: ValueState[Long] = _
override def open(parameters: Configuration): Unit = {
listState = getRuntimeContext.getListState(new ListStateDescriptor[ItemViewCount]("listState",classOf[ItemViewCount]))
timerState= getRuntimeContext.getState(new ValueStateDescriptor[Long]("timerState",classOf[Long]))
}
override def processElement(i: ItemViewCount, context: KeyedProcessFunction[Long, ItemViewCount, String]#Context, collector: Collector[String]): Unit = {
listState.add(i)
//注册定时器 窗口下一秒执行
val ts = i.windowEnd + 1
context.timerService().registerEventTimeTimer(ts)
}
override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[Long, ItemViewCount, String]#OnTimerContext, out: Collector[String]): Unit = {
//取出窗口的所有数据 java集合
val itemViewCounts: lang.Iterable[ItemViewCount] = listState.get()
//创建scala集合 存放数据
val buffer: ListBuffer[ItemViewCount] = ListBuffer[ItemViewCount]()
import scala.collection.JavaConversions._
for(i <- itemViewCounts){
buffer += i
}
//对窗口的数据排序 去TopN
val sortedItems: ListBuffer[ItemViewCount] = buffer.sortBy(_.count).reverse.take(size)
//将排名信息格式化成String,便于打印
val result: StringBuilder = new StringBuilder
result.append("=============================")
result.append("时间:").append(new Timestamp(timestamp-1)).append("\n")
for(i<-sortedItems.indices){
val currentItem: ItemViewCount = sortedItems(i)
result.append("No.").append(i+1).append(":")
.append("商品ID=").append(currentItem.itemId)
.append("浏览量=").append(currentItem.count).append("\n")
}
result.append("=========================")
Thread.sleep(1000)
out.collect(result.toString())
}
}