#coding=utf-8
from pyspark.sql import SparkSession, HiveContext
# Single shared SparkSession with Hive support; every function below uses it.
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
def getTablelist(hive_database):
    """Return the names of all tables in the given Hive database.

    :param hive_database: name of the Hive database to inspect
    :return: list of table-name strings
    """
    tables_df = spark.sql("SHOW TABLES FROM {0}".format(hive_database))
    # Each collected row looks like Row(tableName='ab'); extract the string.
    # (The original wrapped each name in a one-element list and then
    # unwrapped it again in a second loop — collapsed to one comprehension.)
    rows = tables_df.select('tableName').collect()
    return [row.tableName for row in rows]
def getTableNameAndRows(tableNameList, hive_database):
    """Collect the row-count statistic for every table in *tableNameList*.

    For each table, runs ANALYZE TABLE so Hive refreshes its statistics,
    then reads the "Statistics" row out of DESC FORMATTED.

    :param tableNameList: iterable of table names within *hive_database*
    :param hive_database: name of the Hive database the tables live in
    :return: list of {'tableName': ..., 'rows': ...} dicts, where 'rows'
        is the raw statistics string reported by Hive
    """
    dataList = []
    for table in tableNameList:
        # Refresh table statistics so DESC FORMATTED reports current counts.
        spark.sql(
            "analyze table {0}.{1} compute statistics".format(hive_database, table)
        )
        described = spark.sql(
            "desc formatted {0}.{1}".format(hive_database, table)
        )
        for record in described.select("col_name", "data_type").collect():
            if record.col_name == "Statistics":
                dataList.append({'tableName': table, 'rows': record.data_type})
    return dataList
def get_insert_sql(hive_database, target_table):
    """Build the INSERT OVERWRITE statement that copies the temp view
    ``tmp_<target_table>`` into ``<hive_database>.<target_table>``.

    Bug fix: the original template used an auto-numbered positional field
    (``{}``) together with keyword-only ``.format()`` arguments, which
    raises ``IndexError: Replacement index 0 out of range`` — the field is
    now named ``{hive_database}``.

    :param hive_database: target Hive database name
    :param target_table: target table name (also suffix of the temp view)
    :return: the INSERT OVERWRITE SQL string
    """
    sql = """INSERT OVERWRITE TABLE {hive_database}.{target_table}
SELECT tableName
,rows
FROM tmp_{target_table}
""".format(hive_database=hive_database, target_table=target_table)
    return sql
def dataInsertTable(hive_database, target_table, data_list=None):
    """Write (tableName, rows) records into ``hive_database.target_table``
    via a temporary view and an INSERT OVERWRITE.

    Bug fix: the original read a bare name ``dataList`` that was only ever
    a local variable of ``getTableNameAndRows``, so this function raised
    ``NameError``. Callers can now pass the data explicitly; the old
    global lookup is kept as a backward-compatible fallback.

    :param hive_database: target Hive database name
    :param target_table: target table name
    :param data_list: list of {'tableName': ..., 'rows': ...} dicts as
        produced by ``getTableNameAndRows``; when ``None``, falls back to a
        module-level ``dataList`` binding
    """
    if data_list is None:
        # Backward-compatible fallback to the global the original relied on.
        data_list = dataList
    # Build a DataFrame from the collected records.
    df = spark.createDataFrame(data_list, ["tableName", "rows"])
    # Expose it as a temp view so the INSERT statement can select from it.
    df.createOrReplaceTempView('tmp_{target_table}'.format(target_table=target_table))
    sql = get_insert_sql(hive_database, target_table)
    # Copy the temp view into the permanent table.
    spark.sql(sql)
if __name__ == '__main__':
    hive_database = "campus_business_system"
    # All table names in the database.
    tableNameList = getTablelist(hive_database)
    # Bug fix: the original discarded this return value while
    # dataInsertTable read an (undefined) global named dataList; binding
    # the result here makes that global lookup succeed.
    dataList = getTableNameAndRows(tableNameList, hive_database)
    # Persist the collected row counts.
    dataInsertTable(hive_database, 'table_rows')
# Target table DDL (for reference):
# CREATE TABLE IF NOT EXISTS `table_rows`(
#   `tableName` string COMMENT 'table name',
#   `rows` string COMMENT 'table row count'
# ) COMMENT 'table row counts' STORED AS ORC
# lifecycle 9999;
# PySpark: get the row count of every table in a Hive database.