1. Environment Versions

Environment | Version |
---|---|
Flink | 1.17.0 |
Kafka | 2.12 |
MySQL | 5.7.33 |
2. MySQL Table Creation Script

```sql
create table user_log
(
    id      int auto_increment comment 'primary key'
        primary key,
    uid     int    not null comment 'user id',
    event   int    not null comment 'user action',
    logtime bigint null comment 'log time (epoch milliseconds)'
)
    comment 'user log table, used as the verification data source';
```
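Note that logtime is stored as a BIGINT of epoch milliseconds rather than a native datetime. A quick read-back, sketched below, shows how to turn it back into a readable instant; the class name is arbitrary, and the URL and credentials are placeholders matching the generator's JDBC sink further down.

```java
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.time.Instant;

// Sketch: read a few rows back and render the BIGINT logtime as an Instant.
// URL and credentials are placeholders, same as in the generator's JDBC sink.
public class UserLogReadBack {
    public static void main(String[] args) throws Exception {
        try (Connection conn = DriverManager.getConnection(
                "jdbc:mysql://localhost:3306/demo", "your-username", "your-password");
             Statement stmt = conn.createStatement();
             ResultSet rs = stmt.executeQuery("select uid, event, logtime from user_log limit 5")) {
            while (rs.next()) {
                System.out.printf("uid=%d event=%d logtime=%s%n",
                        rs.getInt("uid"), rs.getInt("event"),
                        Instant.ofEpochMilli(rs.getLong("logtime")));
            }
        }
    }
}
```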
3. User Log Class

Create a new Maven project. This class defines the schema shared by the Kafka and MySQL records.
```java
/**
 * User log POJO
 */
@Data
public class UserLog {
    // user uid
    private int uid;
    // user action
    private int event;
    // log time
    private Date logtime;
}
```
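The generator below serializes this POJO to JSON with Hutool's JSONUtil before writing to Kafka. A standalone check, sketched here, shows the payload shape; it assumes Hutool's default JSONConfig, under which a java.util.Date field is written as epoch milliseconds, which is what the BIGINT logtime column in the Flink table schema expects.

```java
import java.util.Date;
import cn.hutool.json.JSONUtil;

// Standalone sketch (not part of the pipeline): inspect the JSON that
// Hutool produces for UserLog. Assumes the default JSONConfig, which
// serializes Date as epoch milliseconds.
public class UserLogJsonCheck {
    public static void main(String[] args) {
        UserLog userLog = new UserLog();
        userLog.setUid(42);
        userLog.setEvent(1);
        userLog.setLogtime(new Date());
        // Expected shape: {"uid":42,"event":1,"logtime":1754896370000}
        System.out.println(JSONUtil.toJsonStr(userLog));
    }
}
```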
4. User Data Generator

```java
/**
 * User data generator: emits random UserLog records to Kafka,
 * and to MySQL for later verification.
 */
public class UserLogGenerator {
    public static void main(String[] args) throws Exception {
        // 1. Obtain the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // 2. Custom data generator source
        DataGeneratorSource<UserLog> dataGeneratorSource = new DataGeneratorSource<>(
                // GeneratorFunction implementation
                new GeneratorFunction<Long, UserLog>() {
                    // random data generator
                    public RandomDataGenerator generator;

                    @Override
                    public void open(SourceReaderContext readerContext) throws Exception {
                        generator = new RandomDataGenerator();
                    }

                    @Override
                    public UserLog map(Long aLong) throws Exception {
                        UserLog userLog = new UserLog();
                        // random user uid
                        userLog.setUid(generator.nextInt(1, 100000));
                        // random user action (bounds are inclusive: 1 or 2)
                        userLog.setEvent(generator.nextInt(1, 2));
                        // random log time: now, offset by up to ±2 seconds
                        userLog.setLogtime(DateUtil.offset(new DateTime(), DateField.MILLISECOND, generator.nextInt(-2000, 2000)));
                        return userLog;
                    }
                },
                // total number of records to emit (36,000: one hour at 10 records/second)
                60 * 60 * 10,
                // records emitted per second
                RateLimiterStrategy.perSecond(10),
                // result type: wrap the Java UserLog class into TypeInformation
                TypeInformation.of(UserLog.class)
        );
        DataStreamSource<UserLog> dataGeneratorSourceStream =
                env.fromSource(dataGeneratorSource, WatermarkStrategy.noWatermarks(), "dataGeneratorSource");
        // print generated records
        // dataGeneratorSourceStream.print();

        // write to Kafka
        KafkaSink<UserLog> kafkaSink = KafkaSink.<UserLog>builder()
                .setBootstrapServers("hadoop01:9092")
                .setRecordSerializer(
                        KafkaRecordSerializationSchema.<UserLog>builder()
                                .setTopic("userLog")
                                .setValueSerializationSchema((SerializationSchema<UserLog>) userLog -> JSONUtil.toJsonStr(userLog).getBytes())
                                .build()
                ).build();
        dataGeneratorSourceStream.sinkTo(kafkaSink);

        // write to MySQL, used for data verification
        SinkFunction<UserLog> jdbcSink = JdbcSink.sink(
                "insert into user_log (uid, event, logtime) values (?, ?, ?)",
                new JdbcStatementBuilder<UserLog>() {
                    @Override
                    public void accept(PreparedStatement preparedStatement, UserLog userLog) throws SQLException {
                        preparedStatement.setInt(1, userLog.getUid());
                        preparedStatement.setInt(2, userLog.getEvent());
                        preparedStatement.setLong(3, userLog.getLogtime().getTime());
                    }
                },
                JdbcExecutionOptions.builder()
                        .withBatchSize(1000)
                        .withBatchIntervalMs(200)
                        .withMaxRetries(5)
                        .build(),
                new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
                        .withUrl("jdbc:mysql://localhost:3306/demo")
                        .withDriverName("com.mysql.cj.jdbc.Driver")
                        .withUsername("your-username")
                        .withPassword("your-password")
                        .build()
        );
        dataGeneratorSourceStream.addSink(jdbcSink);

        env.execute();
    }
}
```
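Before building the Table API job, it can be worth confirming that records are actually reaching the topic. The following is a minimal read-back sketch using Flink's KafkaSource; the class name and consumer group id are arbitrary choices, not part of the original setup.

```java
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

// Minimal sketch: read the userLog topic back as raw JSON strings and print them.
public class UserLogTopicCheck {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        KafkaSource<String> source = KafkaSource.<String>builder()
                .setBootstrapServers("hadoop01:9092")
                .setTopics("userLog")
                .setGroupId("user-log-check") // arbitrary consumer group id
                .setStartingOffsets(OffsetsInitializer.latest())
                .setValueOnlyDeserializer(new SimpleStringSchema())
                .build();
        env.fromSource(source, WatermarkStrategy.noWatermarks(), "kafkaSource").print();
        env.execute();
    }
}
```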
5. Table API: User Visits within a 10-Second Window

```java
/**
 * Counts user visits (PV) in 10-second tumbling windows.
 */
public class UserLogCount {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tableEnv = StreamTableEnvironment.create(env);
        env.setParallelism(1);

        // 1. Define the table schema
        final Schema schema = Schema.newBuilder()
                .column("uid", DataTypes.INT())
                .column("event", DataTypes.INT())
                .column("logtime", DataTypes.BIGINT())
                // convert logtime (epoch millis) to Flink's timestamp type
                .columnByExpression("rowtime", "TO_TIMESTAMP_LTZ(logtime, 3)")
                // define the watermark
                .watermark("rowtime", "rowtime - INTERVAL '5' SECOND")
                .build();

        // 2. Create the Kafka source table
        tableEnv.createTable("user_log", TableDescriptor.forConnector("kafka")
                .schema(schema)
                .format("json")
                // .option("json.timestamp-format.standard", "ISO-8601")
                .option("json.ignore-parse-errors", "true")
                .option("topic", "userLog")
                .option("properties.bootstrap.servers", "hadoop01:9092")
                .option("scan.startup.mode", "latest-offset")
                .build());

        // 3. Aggregate over a tumbling window
        Table pvTable = tableEnv.from("user_log")
                // define a 10-second tumbling window
                .window(Tumble.over(lit(10).seconds()).on($("rowtime")).as("w"))
                .groupBy($("w"))
                .select(
                        $("w").start().as("w_start"),
                        $("w").end().as("w_end"),
                        // $("uid").count().distinct().as("uv"),
                        $("uid").count().as("pv"));
        pvTable.execute().print();
    }
}
```
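For comparison, the same aggregation can be written in Flink SQL using the TUMBLE windowing table-valued function; the sketch below would run inside the same main method, in place of the Table API pipeline, against the user_log table registered above.

```java
// Equivalent Flink SQL over the registered user_log table.
tableEnv.executeSql(
        "SELECT window_start AS w_start, window_end AS w_end, COUNT(uid) AS pv " +
        "FROM TABLE(TUMBLE(TABLE user_log, DESCRIPTOR(rowtime), INTERVAL '10' SECOND)) " +
        "GROUP BY window_start, window_end"
).print();
```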
6. Data Verification

- Start UserLogGenerator
- Start UserLogCount

```
+----+-------------------------+-------------------------+----------------------+
| op | w_start | w_end | pv |
+----+-------------------------+-------------------------+----------------------+
| +I | 2025-08-11 15:11:50.000 | 2025-08-11 15:12:00.000 | 10 |
| +I | 2025-08-11 15:12:00.000 | 2025-08-11 15:12:10.000 | 95 |
| +I | 2025-08-11 15:12:10.000 | 2025-08-11 15:12:20.000 | 104 |
| +I | 2025-08-11 15:12:20.000 | 2025-08-11 15:12:30.000 | 104 |
| +I | 2025-08-11 15:12:30.000 | 2025-08-11 15:12:40.000 | 94 |
| +I | 2025-08-11 15:12:40.000 | 2025-08-11 15:12:50.000 | 104 |
| +I | 2025-08-11 15:12:50.000 | 2025-08-11 15:13:00.000 | 96 |
| +I | 2025-08-11 15:13:00.000 | 2025-08-11 15:13:10.000 | 100 |
```
- Verify against MySQL

Pick one complete window from the Flink output (the first window above is smaller, presumably because the jobs started partway through it):

```
+----+-------------------------+-------------------------+----------------------+
| op | w_start | w_end | pv |
+----+-------------------------+-------------------------+----------------------+
| +I | 2025-08-11 15:12:50.000 | 2025-08-11 15:13:00.000 | 96 |
```
Convert the window bounds to epoch timestamps:

Field | Datetime | Epoch millis |
---|---|---|
w_start | 2025-08-11 15:12:50.000 | 1754896370000 |
w_end | 2025-08-11 15:13:00.000 | 1754896380000 |
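The conversion depends on the timezone in which the window bounds were printed: TO_TIMESTAMP_LTZ renders in the local session timezone, which here appears to be UTC+8. A small sketch of the arithmetic, assuming Asia/Shanghai:

```java
import java.time.LocalDateTime;
import java.time.ZoneId;

// Sketch: convert the printed window bounds back to epoch milliseconds.
// Assumes the Flink output above was rendered in UTC+8 (Asia/Shanghai).
public class WindowBoundsToMillis {
    public static void main(String[] args) {
        ZoneId zone = ZoneId.of("Asia/Shanghai");
        long wStart = LocalDateTime.parse("2025-08-11T15:12:50")
                .atZone(zone).toInstant().toEpochMilli();
        long wEnd = LocalDateTime.parse("2025-08-11T15:13:00")
                .atZone(zone).toInstant().toEpochMilli();
        System.out.println(wStart); // 1754896370000
        System.out.println(wEnd);   // 1754896380000
    }
}
```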
Query in MySQL:

```sql
# returns 96, consistent with the Flink result
select count(*)
from user_log
where logtime >= 1754896370000
  and logtime < 1754896380000;
```
7. POM File
```xml
<project>
<groupId>dblab</groupId>
<artifactId>demo</artifactId>
<modelVersion>4.0.0</modelVersion>
<name> </name>
<packaging>jar</packaging>
<version>1.0</version>
<repositories>
<repository>
<id>central-repos</id>
<name>Central Repository</name>
<url>http://repo.maven.apache.org/maven2</url>
</repository>
<repository>
<id>alimaven</id>
<name>aliyun maven</name>
<url>https://maven.aliyun.com/nexus/content/groups/public/</url>
</repository>
</repositories>
<properties>
<flink.version>1.17.0</flink.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- <dependency>-->
<!-- <groupId>org.apache.flink</groupId>-->
<!-- <artifactId>flink-connector-files</artifactId>-->
<!-- <version>${flink.version}</version>-->
<!-- </dependency>-->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-datagen</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-java-bridge</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner-loader</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-runtime</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-common</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- <dependency>-->
<!-- <groupId>org.apache.flink</groupId>-->
<!-- <artifactId>flink-csv</artifactId>-->
<!-- <version>${flink.version}</version>-->
<!-- </dependency>-->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-jdbc</artifactId>
<version>3.1.1-1.17</version>
</dependency>
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>8.0.33</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-json</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.26</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.8.39</version>
</dependency>
</dependencies>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.0.0</version>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>
```
8. Common Issues

8.1 No watermark defined

```
Exception in thread "main" org.apache.flink.table.api.ValidationException: A group window expects a time attribute for grouping in a stream environment.
at org.apache.flink.table.operations.utils.AggregateOperationFactory.validateStreamTimeAttribute(AggregateOperationFactory.java:327)
at org.apache.flink.table.operations.utils.AggregateOperationFactory.validateTimeAttributeType(AggregateOperationFactory.java:307)
at org.apache.flink.table.operations.utils.AggregateOperationFactory.getValidatedTimeAttribute(AggregateOperationFactory.java:300)
at org.apache.flink.table.operations.utils.AggregateOperationFactory.createResolvedWindow(AggregateOperationFactory.java:265)
at org.apache.flink.table.operations.utils.OperationTreeBuilder.windowAggregate(OperationTreeBuilder.java:262)
at org.apache.flink.table.api.internal.TableImpl$WindowGroupedTableImpl.select(TableImpl.java:641)
at UserLogCount.main(UserLogCount.java:42)
```
When no watermark is defined in the Table API schema, Flink cannot derive an event-time attribute for the window, and the query fails with the exception above. The fix is the watermark definition from the schema in section 5:
```java
// define the watermark
.watermark("rowtime", "rowtime - INTERVAL '5' SECOND")
```
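If event-time semantics are not actually needed, an alternative (a sketch, not part of the original setup) is to window on a processing-time attribute instead, which requires no watermark:

```java
// Alternative sketch: a processing-time attribute needs no watermark.
final Schema schema = Schema.newBuilder()
        .column("uid", DataTypes.INT())
        .column("event", DataTypes.INT())
        .column("logtime", DataTypes.BIGINT())
        // processing-time attribute instead of an event-time rowtime
        .columnByExpression("proctime", "PROCTIME()")
        .build();

// ...and window on it in the query:
// .window(Tumble.over(lit(10).seconds()).on($("proctime")).as("w"))
```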