Spark Dataset: A Roundup of Common APIs
dtypes: returns every column name with its data type (usage sketch after the columns entry below)
/**
* Returns all column names and their data types as an array.
*
* @group basic
* @since 1.6.0
*/
def dtypes: Array[(String, String)] = schema.fields.map { field =>
(field.name, field.dataType.toString)
}
columns: returns all column names
/**
* Returns all column names as an array.
*
* @group basic
* @since 1.6.0
*/
def columns: Array[String] = schema.fields.map(_.name)
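A minimal spark-shell-style sketch covering both dtypes and columns. The session, the Person case class, and the people data are illustrative assumptions reused by the later sketches; they are not part of the Spark source:
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("dataset-api-notes").master("local[*]").getOrCreate()
import spark.implicits._

// Hypothetical sample data reused by the sketches below.
case class Person(name: String, department: String, gender: String, age: Long, salary: Double)
val people = Seq(
  Person("Ann", "eng", "F", 34, 120000.0),
  Person("Bob", "eng", "M", 28, 95000.0),
  Person("Cid", "sales", "M", 45, 87000.0)
).toDS()

people.dtypes.foreach { case (name, tpe) => println(s"$name: $tpe") }  // e.g. ("age", "LongType")
println(people.columns.mkString(", "))                                 // name, department, gender, age, salary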
isEmpty: checks whether the Dataset is empty
/**
* Returns true if the `Dataset` is empty.
*
* @group basic
* @since 2.4.0
*/
def isEmpty: Boolean = withAction("isEmpty", limit(1).groupBy().count().queryExecution) { plan =>
plan.executeCollect().head.getLong(0) == 0
}
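A sketch reusing the hypothetical people Dataset from above. Because isEmpty only evaluates limit(1), it is cheaper than counting everything; note the @since 2.4.0 tag.
// Cheaper than people.filter(...).count() == 0 because only limit(1) is evaluated.
if (people.filter($"age" > 100).isEmpty) {
  println("no centenarians in the sample")
}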
na: handles missing (null) data via DataFrameNaFunctions, e.g. dropping rows that contain nulls
/**
* Returns a [[DataFrameNaFunctions]] for working with missing data.
* {{{
* // Dropping rows containing any null values.
* ds.na.drop()
* }}}
*
* @group untypedrel
* @since 1.6.0
*/
def na: DataFrameNaFunctions = new DataFrameNaFunctions(toDF())
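A sketch of the DataFrameNaFunctions handle, again on the hypothetical people data; the added bonus column exists only to produce nulls:
import org.apache.spark.sql.functions.lit

val withNulls = people.toDF().withColumn("bonus", lit(null).cast("double"))
withNulls.na.drop().show()                    // drop rows that contain any null
withNulls.na.fill(0.0, Seq("bonus")).show()   // replace nulls in "bonus" with 0.0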
join: joins with another Dataset
/**
* Inner equi-join with another `DataFrame` using the given column.
*
* Different from other join functions, the join column will only appear once in the output,
* i.e. similar to SQL's `JOIN USING` syntax.
*
* {{{
* // Joining df1 and df2 using the column "user_id"
* df1.join(df2, "user_id")
* }}}
*
* @param right Right side of the join operation.
* @param usingColumn Name of the column to join on. This column must exist on both sides.
*
* @note If you perform a self-join using this function without aliasing the input
* `DataFrame`s, you will NOT be able to reference any columns after the join, since
* there is no way to disambiguate which side of the join you would like to reference.
*
* @group untypedrel
* @since 2.0.0
*/
def join(right: Dataset[_], usingColumn: String): DataFrame = {
join(right, Seq(usingColumn))
}
def join(right: Dataset[_], usingColumns: Seq[String]): DataFrame = {
join(right, usingColumns, "inner")
}
/**
* Equi-join with another `DataFrame` using the given columns. A cross join with a predicate
* is specified as an inner join. If you would explicitly like to perform a cross join use the
* `crossJoin` method.
*
* Different from other join functions, the join columns will only appear once in the output,
* i.e. similar to SQL's `JOIN USING` syntax.
*
* @param right Right side of the join operation.
* @param usingColumns Names of the columns to join on. These columns must exist on both sides.
* @param joinType Type of join to perform. Default `inner`. Must be one of:
* `inner`, `cross`, `outer`, `full`, `full_outer`, `left`, `left_outer`,
* `right`, `right_outer`, `left_semi`, `left_anti`.
*
* @note If you perform a self-join using this function without aliasing the input
* `DataFrame`s, you will NOT be able to reference any columns after the join, since
* there is no way to disambiguate which side of the join you would like to reference.
*
* @group untypedrel
* @since 2.0.0
*/
def join(right: Dataset[_], usingColumns: Seq[String], joinType: String): DataFrame = {
// Analyze the self join. The assumption is that the analyzer will disambiguate left vs right
// by creating a new instance for one of the branch.
val joined = sparkSession.sessionState.executePlan(
Join(logicalPlan, right.logicalPlan, joinType = JoinType(joinType), None))
.analyzed.asInstanceOf[Join]
withPlan {
Join(
joined.left,
joined.right,
UsingJoin(JoinType(joinType), usingColumns),
None)
}
}
// Example: df1.join(df2, $"df1Key" === $"df2Key", "outer")
// Joins using an arbitrary join expression and join type; the body (elided in these notes)
// builds a Join plan from joinExprs and joinType, with special handling for self-joins.
def join(right: Dataset[_], joinExprs: Column, joinType: String): DataFrame = {
// ... implementation elided ...
}
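A sketch of the join flavours above, using a hypothetical departments DataFrame; with the USING-style overloads the department column appears only once in the output:
val departments = Seq(("eng", "Engineering"), ("sales", "Sales")).toDF("department", "dept_name")

people.join(departments, "department").show()                // inner equi-join, JOIN USING style
people.join(departments, Seq("department"), "left").show()   // same, with an explicit join type

// Expression-based join: alias both sides so columns stay referable afterwards.
people.toDF().as("p")
  .join(departments.as("d"), $"p.department" === $"d.department", "outer")
  .show()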
sortWithinPartitions: sorts rows within each partition (SQL SORT BY)
/**
* Returns a new Dataset with each partition sorted by the given expressions.
*
* This is the same operation as "SORT BY" in SQL (Hive QL).
*
* @group typedrel
* @since 2.0.0
*/
@scala.annotation.varargs
def sortWithinPartitions(sortCol: String, sortCols: String*): Dataset[T] = {
sortWithinPartitions((sortCol +: sortCols).map(Column(_)) : _*)
}
/**
* Returns a new Dataset with each partition sorted by the given expressions.
*
* This is the same operation as "SORT BY" in SQL (Hive QL).
*
* @group typedrel
* @since 2.0.0
*/
@scala.annotation.varargs
def sortWithinPartitions(sortExprs: Column*): Dataset[T] = {
sortInternal(global = false, sortExprs)
}
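A sketch of the SORT BY behaviour on the hypothetical people data: rows are ordered only inside each partition, not globally:
people.repartition(2, $"department")
  .sortWithinPartitions($"age".desc)
  .show()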
sort: sorts the whole Dataset (usage sketch after orderBy below)
/**
* Returns a new Dataset sorted by the specified column, all in ascending order.
* {{{
* // The following 3 are equivalent
* ds.sort("sortcol")
* ds.sort($"sortcol")
* ds.sort($"sortcol".asc)
* }}}
*
* @group typedrel
* @since 2.0.0
*/
@scala.annotation.varargs
def sort(sortCol: String, sortCols: String*): Dataset[T] = {
sort((sortCol +: sortCols).map(apply) : _*)
}
/**
* Returns a new Dataset sorted by the given expressions. For example:
* {{{
* ds.sort($"col1", $"col2".desc)
* }}}
*
* @group typedrel
* @since 2.0.0
*/
@scala.annotation.varargs
def sort(sortExprs: Column*): Dataset[T] = {
sortInternal(global = true, sortExprs)
}
orderBy: sorts the Dataset; alias of sort
/**
* Returns a new Dataset sorted by the given expressions.
* This is an alias of the `sort` function.
*
* @group typedrel
* @since 2.0.0
*/
@scala.annotation.varargs
def orderBy(sortCol: String, sortCols: String*): Dataset[T] = sort(sortCol, sortCols : _*)
/**
* Returns a new Dataset sorted by the given expressions.
* This is an alias of the `sort` function.
*
* @group typedrel
* @since 2.0.0
*/
@scala.annotation.varargs
def orderBy(sortExprs: Column*): Dataset[T] = sort(sortExprs : _*)
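A sketch of sort and its orderBy alias on the hypothetical people data; both produce a global ordering, unlike sortWithinPartitions:
people.sort($"department", $"age".desc).show()
people.orderBy("department", "age").show()   // string-name variant, ascending by default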
hint: attaches a query hint to the Dataset
/**
* Specifies some hint on the current Dataset. As an example, the following code specifies
* that one of the plans can be broadcast:
*
* {{{
* df1.join(df2.hint("broadcast"))
* }}}
*
* @group basic
* @since 2.2.0
*/
@scala.annotation.varargs
def hint(name: String, parameters: Any*): Dataset[T] = withTypedPlan {
UnresolvedHint(name, parameters, logicalPlan)
}
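A sketch reusing the hypothetical people and departments frames; hinting the small side as broadcastable typically turns a shuffle join into a broadcast join, which explain() should show:
people.join(departments.hint("broadcast"), "department").explain()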
col: selects a single column by name and returns it as a Column
/**
* Selects column based on the column name and return it as a [[Column]].
*
* @note The column name can also refer to a nested column like `a.b`.
*
* @group untypedrel
* @since 2.0.0
*/
def col(colName: String): Column = colName match {
case "*" =>
Column(ResolvedStar(queryExecution.analyzed.output))
case _ =>
val expr = resolve(colName)
Column(expr)
}
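A sketch of col on the hypothetical people data; the returned Column can be combined into expressions, and "*" expands to all columns:
val ageCol = people.col("age")
people.select(ageCol + 1).show()
people.select(people.col("*")).show()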
as: sets an alias on the Dataset
/**
* Returns a new Dataset with an alias set.
*
* @group typedrel
* @since 1.6.0
*/
def as(alias: String): Dataset[T] = withTypedPlan {
SubqueryAlias(alias, logicalPlan)
}
alias: sets an alias; same as as
/**
* Returns a new Dataset with an alias set. Same as `as`.
*
* @group typedrel
* @since 2.0.0
*/
def alias(alias: String): Dataset[T] = as(alias)
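A sketch showing why aliases matter for self-joins (see the @note on join above); both as and alias work, again on the hypothetical people data:
val l = people.as("l")
val r = people.alias("r")
l.join(r, $"l.department" === $"r.department" && $"l.name" =!= $"r.name")
  .select($"l.name", $"r.name")
  .show()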
select: projects a set of columns or expressions
/**
* Selects a set of column based expressions.
* {{{
* ds.select($"colA", $"colB" + 1)
* }}}
*
* @group untypedrel
* @since 2.0.0
*/
@scala.annotation.varargs
def select(cols: Column*): DataFrame = withPlan {
Project(cols.map(_.named), logicalPlan)
}
/**
* Selects a set of columns. This is a variant of `select` that can only select
* existing columns using column names (i.e. cannot construct expressions).
*
* {{{
* // The following two are equivalent:
* ds.select("colA", "colB")
* ds.select($"colA", $"colB")
* }}}
*
* @group untypedrel
* @since 2.0.0
*/
@scala.annotation.varargs
def select(col: String, cols: String*): DataFrame = select((col +: cols).map(Column(_)) : _*)
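A sketch of both select variants on the hypothetical people data; the Column form can build expressions, while the String form can only name existing columns:
import org.apache.spark.sql.functions.upper

people.select($"name", ($"salary" * 1.1).as("raised_salary")).show()
people.select(upper($"name").as("name_uc"), $"age").show()
people.select("name", "age").show()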
toJSON: converts each row to a JSON string, returning a Dataset[String]
/**
* Returns the content of the Dataset as a Dataset of JSON strings.
* @since 2.0.0
*/
def toJSON: Dataset[String] = {
val rowSchema = this.schema
val sessionLocalTimeZone = sparkSession.sessionState.conf.sessionLocalTimeZone
val rdd: RDD[String] = queryExecution.toRdd.mapPartitions { iter =>
val writer = new CharArrayWriter()
// create the Generator without separator inserted between 2 records
val gen = new JacksonGenerator(rowSchema, writer,
new JSONOptions(Map.empty[String, String], sessionLocalTimeZone))
new Iterator[String] {
override def hasNext: Boolean = iter.hasNext
override def next(): String = {
gen.write(iter.next())
gen.flush()
val json = writer.toString
if (hasNext) {
writer.reset()
} else {
gen.close()
}
json
}
}
}
import sparkSession.implicits.newStringEncoder
sparkSession.createDataset(rdd)
}
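A sketch of toJSON on the hypothetical people data; each row becomes one JSON string in a Dataset[String]:
people.toJSON.show(truncate = false)   // e.g. {"name":"Ann","department":"eng",...}
people.toJSON.take(1).foreach(println)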
filter: filters rows matching a condition (Column or SQL expression string)
/**
* Filters rows using the given condition.
* {{{
* // The following are equivalent:
* peopleDs.filter($"age" > 15)
* peopleDs.where($"age" > 15)
* }}}
*
* @group typedrel
* @since 1.6.0
*/
def filter(condition: Column): Dataset[T] = withTypedPlan {
Filter(condition.expr, logicalPlan)
}
/**
* Filters rows using the given SQL expression.
* {{{
* peopleDs.filter("age > 15")
* }}}
*
* @group typedrel
* @since 1.6.0
*/
def filter(conditionExpr: String): Dataset[T] = {
filter(Column(sparkSession.sessionState.sqlParser.parseExpression(conditionExpr)))
}
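A sketch of the two filter overloads on the hypothetical people data; the Column form and the SQL-expression string are equivalent:
people.filter($"age" > 30 && $"department" === "eng").show()
people.filter("age > 30 AND department = 'eng'").show()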
where: filters rows; alias of filter
/**
* Filters rows using the given condition. This is an alias for `filter`.
* {{{
* // The following are equivalent:
* peopleDs.filter($"age" > 15)
* peopleDs.where($"age" > 15)
* }}}
*
* @group typedrel
* @since 1.6.0
*/
def where(condition: Column): Dataset[T] = filter(condition)
/**
* Filters rows using the given SQL expression.
* {{{
* peopleDs.where("age > 15")
* }}}
*
* @group typedrel
* @since 1.6.0
*/
def where(conditionExpr: String): Dataset[T] = {
filter(Column(sparkSession.sessionState.sqlParser.parseExpression(conditionExpr)))
}
groupBy: groups rows so aggregations can be run on them
/**
* Groups the Dataset using the specified columns, so we can run aggregation on them. See
* [[RelationalGroupedDataset]] for all the available aggregate functions.
*
* {{{
* // Compute the average for all numeric columns grouped by department.
* ds.groupBy($"department").avg()
*
* // Compute the max age and average salary, grouped by department and gender.
* ds.groupBy($"department", $"gender").agg(Map(
* "salary" -> "avg",
* "age" -> "max"
* ))
* }}}
*
* @group untypedrel
* @since 2.0.0
*/
@scala.annotation.varargs
def groupBy(cols: Column*): RelationalGroupedDataset = {
RelationalGroupedDataset(toDF(), cols.map(_.expr), RelationalGroupedDataset.GroupByType)
}
/**
* Groups the Dataset using the specified columns, so that we can run aggregation on them.
* See [[RelationalGroupedDataset]] for all the available aggregate functions.
*
* This is a variant of groupBy that can only group by existing columns using column names
* (i.e. cannot construct expressions).
*
* {{{
* // Compute the average for all numeric columns grouped by department.
* ds.groupBy("department").avg()
*
* // Compute the max age and average salary, grouped by department and gender.
* ds.groupBy($"department", $"gender").agg(Map(
* "salary" -> "avg",
* "age" -> "max"
* ))
* }}}
* @group untypedrel
* @since 2.0.0
*/
@scala.annotation.varargs
def groupBy(col1: String, cols: String*): RelationalGroupedDataset = {
val colNames: Seq[String] = col1 +: cols
RelationalGroupedDataset(
toDF(), colNames.map(colName => resolve(colName)), RelationalGroupedDataset.GroupByType)
}
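A sketch of groupBy on the hypothetical people data; the result is a RelationalGroupedDataset, so aggregation yields an untyped DataFrame:
import org.apache.spark.sql.functions.{avg, max}

people.groupBy($"department")
  .agg(avg($"salary").as("avg_salary"), max($"age").as("max_age"))
  .show()
people.groupBy("department", "gender").count().show()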
rollup: multi-dimensional rollup aggregation (usage sketch after cube below)
/**
* Create a multi-dimensional rollup for the current Dataset using the specified columns,
* so we can run aggregation on them.
* See [[RelationalGroupedDataset]] for all the available aggregate functions.
*
* {{{
* // Compute the average for all numeric columns rolled up by department and group.
* ds.rollup($"department", $"group").avg()
*
* // Compute the max age and average salary, rolled up by department and gender.
* ds.rollup($"department", $"gender").agg(Map(
* "salary" -> "avg",
* "age" -> "max"
* ))
* }}}
*
* @group untypedrel
* @since 2.0.0
*/
@scala.annotation.varargs
def rollup(cols: Column*): RelationalGroupedDataset = {
RelationalGroupedDataset(toDF(), cols.map(_.expr), RelationalGroupedDataset.RollupType)
}
cube: multi-dimensional cube aggregation
/**
* Create a multi-dimensional cube for the current Dataset using the specified columns,
* so we can run aggregation on them.
* See [[RelationalGroupedDataset]] for all the available aggregate functions.
*
* {{{
* // Compute the average for all numeric columns cubed by department and group.
* ds.cube($"department", $"group").avg()
*
* // Compute the max age and average salary, cubed by department and gender.
* ds.cube($"department", $"gender").agg(Map(
* "salary" -> "avg",
* "age" -> "max"
* ))
* }}}
*
* @group untypedrel
* @since 2.0.0
*/
@scala.annotation.varargs
def cube(cols: Column*): RelationalGroupedDataset = {
RelationalGroupedDataset(toDF(), cols.map(_.expr), RelationalGroupedDataset.CubeType)
}
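A sketch contrasting rollup and cube on the hypothetical people data: rollup adds hierarchical subtotals (per department, then a grand total), while cube adds subtotals for every combination of the grouping columns:
people.rollup($"department", $"gender").count().show()
people.cube($"department", $"gender").agg(Map("salary" -> "avg")).show()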
reduce: reduces the typed elements with a binary function
/**
* :: Experimental ::
* (Scala-specific)
* Reduces the elements of this Dataset using the specified binary function. The given `func`
* must be commutative and associative or the result may be non-deterministic.
*
* @group action
* @since 1.6.0
*/
@Experimental
@InterfaceStability.Evolving
def reduce(func: (T, T) => T): T = rdd.reduce(func)
/**
* :: Experimental ::
* (Java-specific)
* Reduces the elements of this Dataset using the specified binary function. The given `func`
* must be commutative and associative or the result may be non-deterministic.
*
* @group action
* @since 1.6.0
*/
@Experimental
@InterfaceStability.Evolving
def reduce(func: ReduceFunction[T]): T = reduce(func.call(_, _))
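A sketch of reduce on the hypothetical people Dataset; it is an action over the typed elements, and the function should be commutative and associative:
val oldest: Person = people.reduce((a, b) => if (a.age >= b.age) a else b)
println(oldest.name)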
intersect: set intersection, like SQL INTERSECT (usage sketch after except below)
/**
* Returns a new Dataset containing rows only in both this Dataset and another Dataset.
* This is equivalent to `INTERSECT` in SQL.
*
* @note Equality checking is performed directly on the encoded representation of the data
* and thus is not affected by a custom `equals` function defined on `T`.
*
* @group typedrel
* @since 1.6.0
*/
def intersect(other: Dataset[T]): Dataset[T] = withSetOperator {
Intersect(logicalPlan, other.logicalPlan)
}
except: set difference, like SQL EXCEPT
/**
* Returns a new Dataset containing rows in this Dataset but not in another Dataset.
* This is equivalent to `EXCEPT` in SQL.
*
* @note Equality checking is performed directly on the encoded representation of the data
* and thus is not affected by a custom `equals` function defined on `T`.
*
* @group typedrel
* @since 2.0.0
*/
def except(other: Dataset[T]): Dataset[T] = withSetOperator {
Except(logicalPlan, other.logicalPlan)
}
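A sketch of the set operators on two small hypothetical typed Datasets (spark.implicits._ from the first sketch is still in scope):
val a = Seq(1, 2, 3, 4).toDS()
val b = Seq(3, 4, 5).toDS()
a.intersect(b).show()   // rows in both: 3, 4
a.except(b).show()      // rows only in a: 1, 2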
withColumn: adds a new column, or replaces an existing column with the same name
/**
* Returns a new Dataset by adding a column or replacing the existing column that has
* the same name.
*
* @group untypedrel
* @since 2.0.0
*/
def withColumn(colName: String, col: Column): DataFrame = {
val resolver = sparkSession.sessionState.analyzer.resolver
val output = queryExecution.analyzed.output
val shouldReplace = output.exists(f => resolver(f.name, colName))
if (shouldReplace) {
val columns = output.map { field =>
if (resolver(field.name, colName)) {
col.as(colName)
} else {
Column(field)
}
}
select(columns : _*)
} else {
select(Column("*"), col.as(colName))
}
}
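A sketch of withColumn on the hypothetical people data; the second call replaces the bonus column because the name matches (name matching goes through the analyzer's resolver):
import org.apache.spark.sql.functions.col

val withBonus = people.withColumn("bonus", $"salary" * 0.1)
val adjusted  = withBonus.withColumn("bonus", col("bonus") + 500)
adjusted.show()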
drop: drops one or more columns
/**
* Returns a new Dataset with a column dropped. This is a no-op if schema doesn't contain
* column name.
*
* This method can only be used to drop top level columns. The colName string is treated
* literally without further interpretation.
*
* @group untypedrel
* @since 2.0.0
*/
def drop(colName: String): DataFrame = {
drop(Seq(colName) : _*)
}
/**
* Returns a new Dataset with columns dropped.
* This is a no-op if schema doesn't contain column name(s).
*
* This method can only be used to drop top level columns. The colName string is treated literally
* without further interpretation.
*
* @group untypedrel
* @since 2.0.0
*/
@scala.annotation.varargs
def drop(colNames: String*): DataFrame = {
val resolver = sparkSession.sessionState.analyzer.resolver
val allColumns = queryExecution.analyzed.output
val remainingCols = allColumns.filter { attribute =>
colNames.forall(n => !resolver(attribute.name, n))
}.map(attribute => Column(attribute))
if (remainingCols.size == allColumns.size) {
toDF()
} else {
this.select(remainingCols: _*)
}
}
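A sketch of drop on the hypothetical people data; dropping a column that does not exist is a no-op rather than an error:
people.drop("salary").printSchema()
people.drop("salary", "no_such_column").printSchema()   // "no_such_column" is silently ignored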
describe: basic summary statistics (count, mean, stddev, min, max)
/**
* Computes statistics for numeric and string columns, including count, mean, stddev, min, and
* max. If no columns are given, this function computes statistics for all numerical or string
* columns.
*
* This function is meant for exploratory data analysis, as we make no guarantee about the
* backward compatibility of the schema of the resulting Dataset. If you want to
* programmatically compute summary statistics, use the `agg` function instead.
*
* {{{
* ds.describe("age", "height").show()
*
* // output:
* // summary age height
* // count 10.0 10.0
* // mean 53.3 178.05
* // stddev 11.6 15.7
* // min 18.0 163.0
* // max 92.0 192.0
* }}}
*
* @group action
* @since 1.6.0
*/
@scala.annotation.varargs
def describe(cols: String*): DataFrame = withPlan {
// The list of summary statistics to compute, in the form of expressions.
val statistics = List[(String, Expression => Expression)](
"count" -> ((child: Expression) => Count(child).toAggregateExpression()),
"mean" -> ((child: Expression) => Average(child).toAggregateExpression()),
"stddev" -> ((child: Expression) => StddevSamp(child).toAggregateExpression()),
"min" -> ((child: Expression) => Min(child).toAggregateExpression()),
"max" -> ((child: Expression) => Max(child).toAggregateExpression()))
val outputCols =
(if (cols.isEmpty) aggregatableColumns.map(usePrettyExpression(_).sql) else cols).toList
val ret: Seq[Row] = if (outputCols.nonEmpty) {
val aggExprs = statistics.flatMap { case (_, colToAgg) =>
outputCols.map(c => Column(Cast(colToAgg(Column(c).expr), StringType)).as(c))
}
val row = groupBy().agg(aggExprs.head, aggExprs.tail: _*).head().toSeq
// Pivot the data so each summary is one row
row.grouped(outputCols.size).toSeq.zip(statistics).map { case (aggregation, (statistic, _)) =>
Row(statistic :: aggregation.toList: _*)
}
} else {
// If there are no output columns, just output a single column that contains the stats.
statistics.map { case (name, _) => Row(name) }
}
// All columns are string type
val schema = StructType(
StructField("summary", StringType) :: outputCols.map(StructField(_, StringType))).toAttributes
// `toArray` forces materialization to make the seq serializable
LocalRelation.fromExternalRows(schema, ret.toArray.toSeq)
}
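A sketch of describe on the hypothetical people data; it is meant for exploration and returns an all-string DataFrame, so use agg for programmatic statistics:
people.describe("age", "salary").show()
people.describe().show()   // no arguments: all numeric and string columns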
head: returns the first n rows
/**
* Returns the first `n` rows.
*
* @note this method should only be used if the resulting array is expected to be small, as
* all the data is loaded into the driver's memory.
*
* @group action
* @since 1.6.0
*/
def head(n: Int): Array[T] = withAction("head", limit(n).queryExecution)(collectFromPlan)
/**
* Returns the first row.
* @group action
* @since 1.6.0
*/
def head(): T = head(1).head
/**
* Returns the first row. Alias for head().
* @group action
* @since 1.6.0
*/
def first(): T = head()
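A sketch of head and first on the hypothetical people data; both collect results to the driver, so keep n small:
val firstTwo: Array[Person] = people.orderBy($"age").head(2)
val youngest: Person = people.orderBy($"age").first()
println(youngest.name)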