前言
解析SQL的工具很多,我这里使用的是CCJSqlParserUtil。但是这个方案的局限性比较多,扩展性会比较差,如果对血缘分析有更高要求的话可以去试试Antlr4来处理,Antlr4可以扩展以适配更多不同类型的数据库。
背景介绍和思路
我这边的需求是获取数据集中字段的血缘,这个数据集本质是一段SELECT查询语句,现在需要获取被查询的字段是由哪些表、哪些字段构成。没有大数据背景,数据都是存在MYSQL或者PG数据库中。我这边想到的方式是通过一层层对SQL语法的解析获取需要的信息构建血缘关系。但是单纯的解析语法肯定无法满足一些情况,对于存在多个Table没有别名分辨字段来源的SQL,解析语法无法分清字段来源于具体哪个Table,例如:
select role_id from sys_user su left join sys_user_role sur on su.id=sur.user_id
这种时候就无法通过解析SQL语法得知role_id究竟属于sys_user表还是sys_user_role表。这个时候就需要连接数据库查询Table存在哪些字段,然后比对判断具体来源。
演示代码
我这里考虑了SQL中存在UNION ALL、WITH、函数、子查询等等情况的存在,应该是满足了常见的SELECT语句的解析,但也可能很多地方没有想到,希望大家可以在评论区里补充。
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
import net.sf.jsqlparser.JSQLParserException;
import net.sf.jsqlparser.expression.*;
import net.sf.jsqlparser.expression.operators.relational.*;
import net.sf.jsqlparser.parser.CCJSqlParserUtil;
import net.sf.jsqlparser.schema.Column;
import net.sf.jsqlparser.schema.Table;
import net.sf.jsqlparser.statement.select.*;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.junit4.SpringRunner;
import java.util.*;
@RunWith(SpringRunner.class)
@SpringBootTest(webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT)
@Slf4j
public class SQLLineageExtractor {
// 血缘关系模型类
@Data
public static class FieldLineage {
private String fieldName; // 字段名称
private String tableName; // 存储字段所在的表(通过表别名或表名识别)
private String functionName; // 存储字段所在的表(通过表别名或表名识别)
}
@Data
public static class TableLineage {
private String tableName;
private String aliasName;
private FromItem fromItem;
}
// private String mainFieldName;
private List<WithItem> withItemsList ;
@Test
public void getRelation() {
String sql = "SELECT X.USERNAME FROM (SELECT SU.USERNAME FROM SYS_USER SU UNION ALL SELECT SUB.REALNAME FROM SYS_USER_BAK SUB ) X";
sql="WITH ABC (SELECT SU.USERNAME FROM SYS_USER) SELECT USERNAME FROM ABC";
sql="SELECT SU.USERNAME , SURB.ROLENAME FROM SYS_USER SU LEFT JOIN (SELECT SUR.ROLENAME,SUR.USER_ID FROM SYS_USER_ROLE SUR)SURB ON SURB.USER_ID=SU.ID";
// 提取字段血缘信息
Map<String, List<FieldLineage>> stringListMap = extractLineage(sql);
for (String s : stringListMap.keySet()) {
System.out.println(s);
List<FieldLineage> fieldLineages = stringListMap.get(s);
System.out.println(fieldLineages);
}
}
// 提取字段的血缘关系
public Map<String, List<FieldLineage>> extractLineage(String sql) {
// 解析 SQL
Select select = null;
try {
select = (Select) CCJSqlParserUtil.parse(sql);
} catch (JSQLParserException e) {
e.printStackTrace();
}
if (select == null) {
return null;
}
Map<String, List<FieldLineage>> mainLineageMap = new LinkedHashMap<>();
// 获取语句中的With,例:WITH user (SELECT * FROM sys_user) SELECT * FROM user
withItemsList = select.getWithItemsList();
// 存在UNION ALL之类的会进入下面
if (select.getSelectBody() instanceof SetOperationList) {
SetOperationList setOperationList = (SetOperationList) select.getSelectBody();
List<SelectBody> selects = setOperationList.getSelects();
for (int i = 0; i < selects.size(); i++) {
SelectBody selectBody = selects.get(i);
if (i == 0) {
mainLineageMap = extractLineage(selectBody.toString());
} else {
Map<String, List<FieldLineage>> otherLineageMap = extractLineage(selectBody.toString());
Set<String> mainKeySet = mainLineageMap.keySet();
Set<String> otherKeySet = otherLineageMap.keySet();
// UNION ALL 的时候好像会按第一段SQL的字段名称显示,后面的一段SQL字段名称不一样的时候会和前面的字段按顺序一一对应,所以这里解析查询血缘的时候需要处理配对下
int mainIndex = 0;
for (String mainKey : mainKeySet) {
int otherIndex = 0;
for (String otherKey : otherKeySet) {
if (mainIndex==otherIndex){
List<FieldLineage> mainFieldLineages = mainLineageMap.get(mainKey);
List<FieldLineage> otherFieldLineages = otherLineageMap.get(otherKey);
mainFieldLineages.addAll(otherFieldLineages);
mainLineageMap.put(mainKey,mainFieldLineages);
}
otherIndex=otherIndex+1;
}
mainIndex = mainIndex + 1;
}
}
}
} else {
// 普通SQL的解析
PlainSelect plainSelect = (PlainSelect) select.getSelectBody();
List<TableLineage> tableLineages = new ArrayList<>();
// 处理 FROM 子句中的表(可能是子查询)
handleFromItem(plainSelect.getFromItem(), tableLineages);
// 处理 JOIN 子句
if (plainSelect.getJoins() != null) {
for (Join join : plainSelect.getJoins()) {
handleFromItem(join.getRightItem(), tableLineages);
}
}
// 处理 SELECT 语句的字段
for (SelectItem item : plainSelect.getSelectItems()) {
if (item instanceof SelectExpressionItem) {
SelectExpressionItem selectExpressionItem = (SelectExpressionItem) item;
Expression expr = selectExpressionItem.getExpression();
Alias alias = selectExpressionItem.getAlias();
String columnName = selectExpressionItem.toString();
if (expr instanceof Column) {
Column column = (Column) expr;
columnName = column.getColumnName();
}
columnName = alias != null ? alias.getName() : columnName;
// this.mainFieldName = columnName;
List<FieldLineage> fieldLineages = handleSelectItem(expr, tableLineages);
mainLineageMap.put(columnName, fieldLineages);
} else if (item instanceof AllColumns) {
// 处理 SELECT * (所有列)
AllColumns allColumns = (AllColumns) item;
// 遍历所有已知的表并获取所有字段
for (TableLineage tableLineage : tableLineages) {
FromItem fromItem = tableLineage.getFromItem();
List<String> fieldList = executeSql(fromItem); // 获取该表的字段列表
// 遍历所有字段
for (String field : fieldList) {
FieldLineage fieldLineage = new FieldLineage();
fieldLineage.setTableName(tableLineage.getTableName());
fieldLineage.setFieldName(field);
// 将字段的血缘信息添加到 map 中
mainLineageMap.computeIfAbsent(field, k -> new ArrayList<>()).add(fieldLineage);
}
}
} else if (item instanceof AllTableColumns) {
// 处理 SELECT table.* (某个表的所有列)
AllTableColumns allTableColumns = (AllTableColumns) item;
String tableName = allTableColumns.getTable().getName();
// 通过 tableName 获取表的字段
for (TableLineage tableLineage : tableLineages) {
if (tableLineage.getTableName().equals(tableName) || tableLineage.getAliasName().equals(tableName)) {
FromItem fromItem = tableLineage.getFromItem();
List<String> fieldList = executeSql(fromItem); // 获取该表的字段列表
// 遍历该表的所有字段
for (String field : fieldList) {
FieldLineage fieldLineage = new FieldLineage();
fieldLineage.setTableName(tableName);
fieldLineage.setFieldName(field);
// 将字段的血缘信息添加到 map 中
mainLineageMap.computeIfAbsent(field, k -> new ArrayList<>()).add(fieldLineage);
}
}
}
}
}
}
return mainLineageMap;
}
private List<FieldLineage> handleSelectItem(Expression expr, List<TableLineage> tableLineages) {
List<FieldLineage> fieldLineageList = new ArrayList<>();
if (null == expr) {
return fieldLineageList;
}
if (expr instanceof Column) {
// 直接字段依赖
Column column = (Column) expr;
Table table = column.getTable();
String columnName = column.getColumnName();
if (table == null) {
for (TableLineage tableLineage : tableLineages) {
FromItem fromItem = tableLineage.getFromItem();
List<String> fieldList = executeSql(fromItem);
if (fieldList.contains(columnName)) {
if (fromItem instanceof Table) {
Table tableFrom = (Table) fromItem;
String tableName = tableFrom.getName();
FieldLineage fieldLineage = new FieldLineage();
fieldLineage.setTableName(tableName);
fieldLineage.setFieldName(columnName);
fieldLineageList.add(fieldLineage);
} else if (fromItem instanceof SubSelect) {
SubSelect subSelect = (SubSelect) fromItem;
Map<String, List<FieldLineage>> listMap = extractLineage(subSelect.getSelectBody().toString());
List<FieldLineage> fieldLineages = listMap.get(columnName);
if (null != fieldLineages) {
fieldLineageList.addAll(fieldLineages);
}
}
break;
}
}
} else {
String tableName = table.getName();
if (null != table.getAlias()) {
tableName = table.getAlias().getName();
}
for (TableLineage tableLineage : tableLineages) {
if (tableName.equals(tableLineage.getTableName()) || tableName.equals(tableLineage.getAliasName())) {
FromItem fromItem = tableLineage.getFromItem();
if (fromItem instanceof Table) {
Table tableFrom = (Table) fromItem;
String formTableName = tableFrom.getName();
FieldLineage fieldLineage = new FieldLineage();
fieldLineage.setTableName(formTableName);
fieldLineage.setFieldName(columnName);
fieldLineageList.add(fieldLineage);
} else if (fromItem instanceof SubSelect) {
SubSelect subSelect = (SubSelect) fromItem;
Map<String, List<FieldLineage>> listMap = extractLineage(subSelect.getSelectBody().toString());
List<FieldLineage> fieldLineages = listMap.get(columnName);
if (null != fieldLineages) {
fieldLineageList.addAll(fieldLineages);
}
}
}
}
}
} else if (expr instanceof SubSelect) {
// 子查询
SubSelect subSelect = (SubSelect) expr;
String columnName = subSelect.toString();
if (null != subSelect.getAlias()) {
columnName = subSelect.getAlias().getName();
}
Map<String, List<FieldLineage>> listMap = extractLineage(subSelect.getSelectBody().toString());
for (String column : listMap.keySet()) {
List<FieldLineage> fieldLineages = listMap.get(column);
fieldLineageList.addAll(fieldLineages);
}
} else if (expr instanceof BinaryExpression) {
// 二元运算表达式
BinaryExpression binaryExpression = (BinaryExpression) expr;
Expression leftExpression = binaryExpression.getLeftExpression();
List<FieldLineage> fieldLeftLineages = handleSelectItem(leftExpression, tableLineages);
fieldLineageList.addAll(fieldLeftLineages);
Expression rightExpression = binaryExpression.getRightExpression();
List<FieldLineage> fieldRightLineages = handleSelectItem(rightExpression, tableLineages);
fieldLineageList.addAll(fieldRightLineages);
} else if (expr instanceof Function) {
Function function = (Function) expr;
ExpressionList parameters = function.getParameters();
if (parameters == null) {
// TODO:当方法为count,now()之类的,字段的血缘关系该如何展示?
// 处理无参数的聚合函数(例如 COUNT(*))
if (function.getName().equalsIgnoreCase("COUNT") ||
function.getName().equalsIgnoreCase("SUM") ||
function.getName().equalsIgnoreCase("AVG") ||
function.getName().equalsIgnoreCase("MAX") ||
function.getName().equalsIgnoreCase("MIN") ||
function.getName().equalsIgnoreCase("NOW") ||
function.getName().equalsIgnoreCase("CURRENT_DATE")) {
// 假设聚合函数是对某个表的字段进行操作
TableLineage tableLineage = tableLineages.get(0);//将第一张表设置为主表
String tableName = tableLineage.getTableName();
FieldLineage fieldLineage = new FieldLineage();
// fieldLineage.setFieldName(mainFieldName);
fieldLineage.setTableName(tableName);
fieldLineage.setFunctionName(function.getName()); // 记录函数名
fieldLineageList.add(fieldLineage);
}
} else {
// 例如COUNT(1) ,parameters不为空,但也没有具体的字段血缘
boolean flag = false;
Expression itemExpression = parameters.getExpressions().get(0);
if (itemExpression instanceof LongValue || itemExpression instanceof StringValue) {
if (parameters.getExpressions().size() == 1) {
flag = true;
}
}
if (flag) {
TableLineage tableLineage = tableLineages.get(0);//将第一张表设置为主表
String tableName = tableLineage.getTableName();
FieldLineage fieldLineage = new FieldLineage();
// fieldLineage.setFieldName(mainFieldName);
fieldLineage.setTableName(tableName);
fieldLineage.setFunctionName(function.getName()); // 记录函数名
fieldLineageList.add(fieldLineage);
} else {
for (Expression expression : parameters.getExpressions()) {
List<FieldLineage> fieldLineages = handleSelectItem(expression, tableLineages);
fieldLineageList.addAll(fieldLineages);
}
}
}
} else if (expr instanceof CaseExpression) {
CaseExpression caseExpr = (CaseExpression) expr;
for (WhenClause whenClause : caseExpr.getWhenClauses()) {
List<FieldLineage> fieldLineages = handleSelectItem(whenClause, tableLineages);
fieldLineageList.addAll(fieldLineages);
}
Expression elseExpression = caseExpr.getElseExpression();
List<FieldLineage> fieldElseLineages = handleSelectItem(elseExpression, tableLineages);
fieldLineageList.addAll(fieldElseLineages);
Expression switchExpression = caseExpr.getSwitchExpression();
List<FieldLineage> fieldSwitchLineages = handleSelectItem(switchExpression, tableLineages);
fieldLineageList.addAll(fieldSwitchLineages);
} else if (expr instanceof WhenClause) {
WhenClause whenClause = (WhenClause) expr;
List<FieldLineage> fieldWhenLineages = handleSelectItem(whenClause.getWhenExpression(), tableLineages);
fieldLineageList.addAll(fieldWhenLineages);
List<FieldLineage> fieldThenLineages = handleSelectItem(whenClause.getThenExpression(), tableLineages);
fieldLineageList.addAll(fieldThenLineages);
} else if (expr instanceof Parenthesis) {
// 处理括号表达式
Parenthesis parenthesis = (Parenthesis) expr;
List<FieldLineage> fieldLineages = handleSelectItem(parenthesis.getExpression(), tableLineages);
fieldLineageList.addAll(fieldLineages);
} else if (expr instanceof MySQLGroupConcat) {
// 处理group_concat
MySQLGroupConcat mySQLGroupConcat = (MySQLGroupConcat) expr;
ExpressionList expressionList = mySQLGroupConcat.getExpressionList();
for (Expression expression : expressionList.getExpressions()) {
List<FieldLineage> fieldLineages = handleSelectItem(expression, tableLineages);
fieldLineageList.addAll(fieldLineages);
}
} else if (expr instanceof LongValue || expr instanceof StringValue) {
// TODO:常量不处理
} else {
Class<? extends Expression> aClass = expr.getClass();
System.out.println("Class.getSimpleName():" + aClass.getSimpleName());
System.out.println("Class.getName():" + aClass.getName());
}
return fieldLineageList;
}
// 执行SQL确定字段来源
private List<String> executeSql(FromItem fromItem) {
List<String> fieldList = new ArrayList<>();
if (fromItem instanceof Table) {
// TODO:这里执行查询table的SQL语句,获取table中的字段
} else if (fromItem instanceof SubSelect) {
// TODO:这里执行查询SQL语句,获取SQL中的字段
}
return fieldList;
}
// 处理 FROM 子句中的项,包括子查询
private void handleFromItem(FromItem fromItem, List<TableLineage> tableLineages) {
TableLineage fromFieldLineage = new TableLineage();
if (fromItem.getAlias() != null) {
fromFieldLineage.setAliasName(fromItem.getAlias().getName());
}
if (fromItem instanceof Table) {
// 普通的表
Table table = (Table) fromItem;
fromFieldLineage.setTableName(table.getName());
fromFieldLineage.setFromItem(table);
} else if (fromItem instanceof SubSelect) {
// 子查询
SubSelect subSelect = (SubSelect) fromItem;
fromFieldLineage.setFromItem(subSelect);
}
if (null!=withItemsList){
for (WithItem withItem : withItemsList) {
if (fromFieldLineage.getTableName().equals(withItem.getName())){
// 获取WITH子句中的查询部分(即子查询)
SelectBody subQuery = withItem.getSelectBody();
// 将子查询转换为FromItem
FromItem newFromItem = new SubSelect();
((SubSelect) newFromItem).setSelectBody(subQuery);
fromFieldLineage.setFromItem(newFromItem);
}
}
}
tableLineages.add(fromFieldLineage);
}
}
效果展示
代码执行可能存在的问题
如果你测试的时候字段下面没有血缘信息的话,要么测试的SQL给字段前面加上 表别名. 要么将下面图片上连接数据库查询SQL的部分补全,返回的内容就是当前SQL执行查出来的字段。
建议最好给执行的SQL加上分页限定查询一条,以免浪费资源。
总结
这是一段不成熟的Demo,希望大家可以在基于使用CCJSqlParserUtil的基础上提一些建议。当然如果在实践过程中还发现有别的不错的方案希望也能在评论区分享。