Spark DataFrame (1)
spark = SparkSession.builder.appName("nik").getOrCreate()
In [18]: print(people)
In [22]: print(schemaPeople)
In [75]: schemaPeople.show()
+--------+---+
| name|age|
+--------+---+
| Ankit| 25|
|Jalfaizy| 22|
| saurabh| 20|
| Bala| 26|
+--------+---+
In [23]: schemaPeople.collect()
In [26]: type(data)
Out[26]: list
root
|-- Category: string (nullable = true)
|-- ID: long (nullable = true)
|-- Truth: boolean (nullable = true)
|-- Value: double (nullable = true)
+--------+---+-----+------+
|Category| ID|Truth| Value|
+--------+---+-----+------+
| A| 1| true|121.44|
| B| 2|false|300.01|
| C| 3| NULL| 10.99|
| E| 4| true| 33.87|
+--------+---+-----+------+
In [31]: df1 = spark.read.format('csv').options(header='true', inferSchema='true',
delimiter=';').load("movielens.csv")
In [32]: print(df1)
DataFrame[movieId: int, title: string, genres: string, userId: int, rating: double, timestamp: int]
In [33]: df1.show()
+-------+--------------------+--------------------+------+------+----------+
|movieId| title| genres|userId|rating| timestamp|
+-------+--------------------+--------------------+------+------+----------+
| 1| Toy Story (1995)|Adventure|Animati...| 7| 3.0| 851866703|
| 2| Jumanji (1995)|Adventure|Childre...| 15| 2.0|1134521380|
| 3|Grumpier Old Men ...| Comedy|Romance| 5| 4.0|1163374957|
| 4|Waiting to Exhale...|Comedy|Drama|Romance| 19| 3.0| 855192868|
| 5|Father of the Bri...| Comedy| 15| 4.5|1093070098|
| 6| Heat (1995)|Action|Crime|Thri...| 15| 4.0|1040205753|
| 7| Sabrina (1995)| Comedy|Romance| 18| 3.0| 856006982|
| 8| Tom and Huck (1995)| Adventure|Children| 30| 4.0| 968786809|
| 9| Sudden Death (1995)| Action| 18| 3.0| 856007219|
| 10| GoldenEye (1995)|Action|Adventure|...| 2| 4.0| 835355493|
| 11|American Presiden...|Comedy|Drama|Romance| 15| 2.5|1093028381|
| 12|Dracula: Dead and...| Comedy|Horror| 67| 3.0| 854711916|
| 13| Balto (1995)|Adventure|Animati...| 182| 3.0| 845745917|
| 14| Nixon (1995)| Drama| 15| 2.5|1166586286|
| 15|Cutthroat Island ...|Action|Adventure|...| 73| 2.5|1255593501|
| 16| Casino (1995)| Crime|Drama| 15| 3.5|1093070150|
| 17|Sense and Sensibi...| Drama|Romance| 2| 5.0| 835355681|
| 18| Four Rooms (1995)| Comedy| 18| 3.0| 856007359|
| 19|Ace Ventura: When...| Comedy| 15| 1.0|1093028409|
| 20| Money Train (1995)|Action|Comedy|Cri...| 23| 1.5|1148720884|
+-------+--------------------+--------------------+------+------+----------+
only showing top 20 rows
In [35]: df2.show()
+------------+-----------+------------+-----------+-----------+
|Sepal length|Sepal width|Petal length|Petal width| Class|
+------------+-----------+------------+-----------+-----------+
| 5.1| 3.5| 1.4| 0.2|Iris-setosa|
| 4.9| 3.0| 1.4| 0.2|Iris-setosa|
| 4.7| 3.2| 1.3| 0.2|Iris-setosa|
| 4.6| 3.1| 1.5| 0.2|Iris-setosa|
| 5.0| 3.6| 1.4| 0.2|Iris-setosa|
| 5.4| 3.9| 1.7| 0.4|Iris-setosa|
| 4.6| 3.4| 1.4| 0.3|Iris-setosa|
| 5.0| 3.4| 1.5| 0.2|Iris-setosa|
| 4.4| 2.9| 1.4| 0.2|Iris-setosa|
| 4.9| 3.1| 1.5| 0.1|Iris-setosa|
| 5.4| 3.7| 1.5| 0.2|Iris-setosa|
| 4.8| 3.4| 1.6| 0.2|Iris-setosa|
| 4.8| 3.0| 1.4| 0.1|Iris-setosa|
| 4.3| 3.0| 1.1| 0.1|Iris-setosa|
| 5.8| 4.0| 1.2| 0.2|Iris-setosa|
| 5.7| 4.4| 1.5| 0.4|Iris-setosa|
| 5.4| 3.9| 1.3| 0.4|Iris-setosa|
| 5.1| 3.5| 1.4| 0.3|Iris-setosa|
| 5.7| 3.8| 1.7| 0.3|Iris-setosa|
| 5.1| 3.8| 1.5| 0.3|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 20 rows
In [36]: # alternate way to create a DataFrame from CSV file
In [38]: df3.show()
+------------+-----------+------------+-----------+-----------+
|Sepal length|Sepal width|Petal length|Petal width| Class|
+------------+-----------+------------+-----------+-----------+
| 5.1| 3.5| 1.4| 0.2|Iris-setosa|
| 4.9| 3| 1.4| 0.2|Iris-setosa|
| 4.7| 3.2| 1.3| 0.2|Iris-setosa|
| 4.6| 3.1| 1.5| 0.2|Iris-setosa|
| 5| 3.6| 1.4| 0.2|Iris-setosa|
| 5.4| 3.9| 1.7| 0.4|Iris-setosa|
| 4.6| 3.4| 1.4| 0.3|Iris-setosa|
| 5| 3.4| 1.5| 0.2|Iris-setosa|
| 4.4| 2.9| 1.4| 0.2|Iris-setosa|
| 4.9| 3.1| 1.5| 0.1|Iris-setosa|
| 5.4| 3.7| 1.5| 0.2|Iris-setosa|
| 4.8| 3.4| 1.6| 0.2|Iris-setosa|
| 4.8| 3| 1.4| 0.1|Iris-setosa|
| 4.3| 3| 1.1| 0.1|Iris-setosa|
| 5.8| 4| 1.2| 0.2|Iris-setosa|
| 5.7| 4.4| 1.5| 0.4|Iris-setosa|
| 5.4| 3.9| 1.3| 0.4|Iris-setosa|
| 5.1| 3.5| 1.4| 0.3|Iris-setosa|
| 5.7| 3.8| 1.7| 0.3|Iris-setosa|
| 5.1| 3.8| 1.5| 0.3|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 20 rows
In [40]: df_dict.show()
+--------+---+-----+------+
|Category| ID|Truth| Value|
+--------+---+-----+------+
| A| 1| true|121.44|
| B| 2|false|300.01|
| C| 3| NULL| 10.99|
| E| 4| true| 33.87|
+--------+---+-----+------+
+--------+---+-----+------+
|Category| ID|Truth| Value|
+--------+---+-----+------+
| E| 4| true| 33.87|
| A| 1| true|121.44|
+--------+---+-----+------+
+--------+---+-----+------+
|Category| ID|Truth| Value|
+--------+---+-----+------+
| A| 1| true|121.44|
| E| 4| true| 33.87|
+--------+---+-----+------+
In [44]: b = a.sort(df_dict.Value)
In [45]: b.show()
+--------+---+-----+------+
|Category| ID|Truth| Value|
+--------+---+-----+------+
| E| 4| true| 33.87|
| A| 1| true|121.44|
+--------+---+-----+------+
In [47]: df_dict.createOrReplaceTempView('table')
In [49]: c.show()
+--------+---+-----+------+
|Category| ID|Truth| Value|
+--------+---+-----+------+
| E| 4| true| 33.87|
| A| 1| true|121.44|
+--------+---+-----+------+
In [51]: df3.show()
+------------+-----------+------------+-----------+-----------+
|Sepal length|Sepal width|Petal length|Petal width| Class|
+------------+-----------+------------+-----------+-----------+
| 5.1| 3.5| 1.4| 0.2|Iris-setosa|
| 4.9| 3| 1.4| 0.2|Iris-setosa|
| 4.7| 3.2| 1.3| 0.2|Iris-setosa|
| 4.6| 3.1| 1.5| 0.2|Iris-setosa|
| 5| 3.6| 1.4| 0.2|Iris-setosa|
| 5.4| 3.9| 1.7| 0.4|Iris-setosa|
| 4.6| 3.4| 1.4| 0.3|Iris-setosa|
| 5| 3.4| 1.5| 0.2|Iris-setosa|
| 4.4| 2.9| 1.4| 0.2|Iris-setosa|
| 4.9| 3.1| 1.5| 0.1|Iris-setosa|
| 5.4| 3.7| 1.5| 0.2|Iris-setosa|
| 4.8| 3.4| 1.6| 0.2|Iris-setosa|
| 4.8| 3| 1.4| 0.1|Iris-setosa|
| 4.3| 3| 1.1| 0.1|Iris-setosa|
| 5.8| 4| 1.2| 0.2|Iris-setosa|
| 5.7| 4.4| 1.5| 0.4|Iris-setosa|
| 5.4| 3.9| 1.3| 0.4|Iris-setosa|
| 5.1| 3.5| 1.4| 0.3|Iris-setosa|
| 5.7| 3.8| 1.7| 0.3|Iris-setosa|
| 5.1| 3.8| 1.5| 0.3|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 20 rows
+------------+-----------+------------+-----------+-----------+
|Sepal length|Sepal width|Petal length|Petal width| Class|
+------------+-----------+------------+-----------+-----------+
| 5.1| 3.5| 1.4| 0.2|Iris-setosa|
| 4.9| 3| 1.4| 0.2|Iris-setosa|
| 4.7| 3.2| 1.3| 0.2|Iris-setosa|
| 4.6| 3.1| 1.5| 0.2|Iris-setosa|
| 5| 3.6| 1.4| 0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 5 rows
root
|-- Sepal length: string (nullable = true)
|-- Sepal width: string (nullable = true)
|-- Petal length: string (nullable = true)
|-- Petal width: string (nullable = true)
|-- Class: string (nullable = true)
In [55]: df3.select('Class').show(5)
+-----------+
| Class|
+-----------+
|Iris-setosa|
|Iris-setosa|
|Iris-setosa|
|Iris-setosa|
|Iris-setosa|
+-----------+
only showing top 5 rows
In [56]: # Select an output few rows of the Sepal length & Petal length column
df3.select('Sepal length', 'Petal Length').show(5)
+------------+------------+
|Sepal length|Petal Length|
+------------+------------+
| 5.1| 1.4|
| 4.9| 1.4|
| 4.7| 1.3|
| 4.6| 1.5|
| 5| 1.4|
+------------+------------+
only showing top 5 rows
In [57]: # Create a new column by converting the petal length value in 0 to 1 range
df3.withColumn("Petal_length_norm",df3["Petal Length"]/10).show(5)
+------------+-----------+------------+-----------+-----------+-------------------+
|Sepal length|Sepal width|Petal length|Petal width|      Class|  Petal_length_norm|
+------------+-----------+------------+-----------+-----------+-------------------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|0.13999999999999999|
|         4.9|          3|         1.4|        0.2|Iris-setosa|0.13999999999999999|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|               0.13|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|               0.15|
|           5|        3.6|         1.4|        0.2|Iris-setosa|0.13999999999999999|
+------------+-----------+------------+-----------+-----------+-------------------+
only showing top 5 rows
+------------+-----------+------------+-----------+-----------+
|Sepal length|Sepal width|Petal length|Petal width| Class|
+------------+-----------+------------+-----------+-----------+
| 4.3| 3| 1.1| 0.1|Iris-setosa|
| 4.4| 3.2| 1.3| 0.2|Iris-setosa|
| 4.4| 2.9| 1.4| 0.2|Iris-setosa|
| 4.4| 3| 1.3| 0.2|Iris-setosa|
| 4.5| 2.3| 1.3| 0.3|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 5 rows
+------------+-----------+------------+-----------+--------------+
|Sepal length|Sepal width|Petal length|Petal width| Class|
+------------+-----------+------------+-----------+--------------+
| 7.9| 3.8| 6.4| 2|Iris-virginica|
| 7.7| 3.8| 6.7| 2.2|Iris-virginica|
| 7.7| 2.8| 6.7| 2|Iris-virginica|
| 7.7| 2.6| 6.9| 2.3|Iris-virginica|
| 7.7| 3| 6.1| 2.3|Iris-virginica|
+------------+-----------+------------+-----------+--------------+
only showing top 5 rows
+------------+-----------+------------+-----------+
|Sepal length|Sepal width|Petal length| Class|
+------------+-----------+------------+-----------+
| 5.1| 3.5| 1.4|Iris-setosa|
| 4.9| 3| 1.4|Iris-setosa|
| 4.7| 3.2| 1.3|Iris-setosa|
| 4.6| 3.1| 1.5|Iris-setosa|
| 5| 3.6| 1.4|Iris-setosa|
+------------+-----------+------------+-----------+
only showing top 5 rows
In [62]: df3.show()
+------------+-----------+------------+-----------+-----------+
|Sepal length|Sepal width|Petal length|Petal width| Class|
+------------+-----------+------------+-----------+-----------+
| 5.1| 3.5| 1.4| 0.2|Iris-setosa|
| 4.9| 3| 1.4| 0.2|Iris-setosa|
| 4.7| 3.2| 1.3| 0.2|Iris-setosa|
| 4.6| 3.1| 1.5| 0.2|Iris-setosa|
| 5| 3.6| 1.4| 0.2|Iris-setosa|
| 5.4| 3.9| 1.7| 0.4|Iris-setosa|
| 4.6| 3.4| 1.4| 0.3|Iris-setosa|
| 5| 3.4| 1.5| 0.2|Iris-setosa|
| 4.4| 2.9| 1.4| 0.2|Iris-setosa|
| 4.9| 3.1| 1.5| 0.1|Iris-setosa|
| 5.4| 3.7| 1.5| 0.2|Iris-setosa|
| 4.8| 3.4| 1.6| 0.2|Iris-setosa|
| 4.8| 3| 1.4| 0.1|Iris-setosa|
| 4.3| 3| 1.1| 0.1|Iris-setosa|
| 5.8| 4| 1.2| 0.2|Iris-setosa|
| 5.7| 4.4| 1.5| 0.4|Iris-setosa|
| 5.4| 3.9| 1.3| 0.4|Iris-setosa|
| 5.1| 3.5| 1.4| 0.3|Iris-setosa|
| 5.7| 3.8| 1.7| 0.3|Iris-setosa|
| 5.1| 3.8| 1.5| 0.3|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 20 rows
In [63]: # Show what are all the different levels of Class column
df3.select('Class').distinct().show()
+---------------+
| Class|
+---------------+
| Iris-virginica|
| Iris-setosa|
|Iris-versicolor|
+---------------+
root
|-- Sepal length: string (nullable = true)
|-- Sepal width: string (nullable = true)
|-- Petal length: string (nullable = true)
|-- Petal width: string (nullable = true)
|-- Class: string (nullable = true)
In [66]: df3.printSchema()
root
|-- Sepal length: integer (nullable = true)
|-- Sepal width: integer (nullable = true)
|-- Petal length: integer (nullable = true)
|-- Petal width: integer (nullable = true)
|-- Class: string (nullable = true)
In [67]: # Find the sum of all columns for each class of iris
df3.select('Class', 'Petal width').groupBy('Class').sum('Petal width').show()
+---------------+----------------+
| Class|sum(Petal width)|
+---------------+----------------+
| Iris-virginica| 79|
| Iris-setosa| 0|
|Iris-versicolor| 50|
+---------------+----------------+
+------------+-----------+------------+-----------+--------------+
|Sepal length|Sepal width|Petal length|Petal width| Class|
+------------+-----------+------------+-----------+--------------+
| 6| 3| 6| 2|Iris-virginica|
| 7| 3| 6| 2|Iris-virginica|
| 7| 2| 6| 1|Iris-virginica|
| 7| 3| 6| 2|Iris-virginica|
| 7| 3| 6| 2|Iris-virginica|
| 7| 2| 6| 2|Iris-virginica|
| 7| 2| 6| 2|Iris-virginica|
| 7| 3| 6| 1|Iris-virginica|
| 7| 2| 6| 1|Iris-virginica|
| 7| 3| 6| 2|Iris-virginica|
| 7| 3| 6| 2|Iris-virginica|
+------------+-----------+------------+-----------+--------------+
In [70]: # Filter the dataframe and give total count of flowers where a petal Length is greater than 5
In [71]: df3.count()
Out[71]: 150
Out[72]: 11
In [74]: print(dir(functions))
['Any', 'ArrayType', 'Callable', 'Column', 'DataFrame', 'DataType', 'Dict', 'Iter
able', 'JVMView', 'List', 'Optional', 'PandasUDFType', 'PySparkTypeError', 'PySpa
rkValueError', 'SparkContext', 'StringType', 'StructType', 'TYPE_CHECKING', 'Tupl
e', 'Type', 'Union', 'UserDefinedFunction', 'UserDefinedTableFunction', 'ValuesVi
ew', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name_
_', '__package__', '__spec__', '_create_column_from_literal', '_create_lambda',
'_create_py_udf', '_create_py_udtf', '_from_numpy_type', '_get_jvm_function', '_g
et_lambda_parameters', '_invoke_binary_math_function', '_invoke_function', '_invo
ke_function_over_columns', '_invoke_function_over_seq_of_columns', '_invoke_highe
r_order_function', '_options_to_str', '_test', '_to_java_column', '_to_seq', '_un
resolved_named_lambda_variable', 'abs', 'acos', 'acosh', 'add_months', 'aes_decry
pt', 'aes_encrypt', 'aggregate', 'any_value', 'approxCountDistinct', 'approx_coun
t_distinct', 'approx_percentile', 'array', 'array_agg', 'array_append', 'array_co
mpact', 'array_contains', 'array_distinct', 'array_except', 'array_insert', 'arra
y_intersect', 'array_join', 'array_max', 'array_min', 'array_position', 'array_pr
epend', 'array_remove', 'array_repeat', 'array_size', 'array_sort', 'array_unio
n', 'arrays_overlap', 'arrays_zip', 'asc', 'asc_nulls_first', 'asc_nulls_last',
'ascii', 'asin', 'asinh', 'assert_true', 'atan', 'atan2', 'atanh', 'avg', 'base6
4', 'bin', 'bit_and', 'bit_count', 'bit_get', 'bit_length', 'bit_or', 'bit_xor',
'bitmap_bit_position', 'bitmap_bucket_number', 'bitmap_construct_agg', 'bitmap_co
unt', 'bitmap_or_agg', 'bitwiseNOT', 'bitwise_not', 'bool_and', 'bool_or', 'broad
cast', 'bround', 'btrim', 'bucket', 'call_function', 'call_udf', 'cardinality',
'cast', 'cbrt', 'ceil', 'ceiling', 'char', 'char_length', 'character_length', 'co
alesce', 'col', 'collect_list', 'collect_set', 'column', 'concat', 'concat_ws',
'contains', 'conv', 'convert_timezone', 'corr', 'cos', 'cosh', 'cot', 'count', 'c
ountDistinct', 'count_distinct', 'count_if', 'count_min_sketch', 'covar_pop', 'co
var_samp', 'crc32', 'create_map', 'csc', 'cume_dist', 'curdate', 'current_catalo
g', 'current_database', 'current_date', 'current_schema', 'current_timestamp', 'c
urrent_timezone', 'current_user', 'date_add', 'date_diff', 'date_format', 'date_f
rom_unix_date', 'date_part', 'date_sub', 'date_trunc', 'dateadd', 'datediff', 'da
tepart', 'day', 'dayofmonth', 'dayofweek', 'dayofyear', 'days', 'decimal', 'decod
e', 'degrees', 'dense_rank', 'desc', 'desc_nulls_first', 'desc_nulls_last', 'e',
'element_at', 'elt', 'encode', 'endswith', 'equal_null', 'every', 'exists', 'ex
p', 'explode', 'explode_outer', 'expm1', 'expr', 'extract', 'factorial', 'filte
r', 'find_in_set', 'first', 'first_value', 'flatten', 'floor', 'forall', 'format_
number', 'format_string', 'from_csv', 'from_json', 'from_unixtime', 'from_utc_tim
estamp', 'functools', 'get', 'get_active_spark_context', 'get_json_object', 'getb
it', 'greatest', 'grouping', 'grouping_id', 'has_numpy', 'hash', 'hex', 'histogra
m_numeric', 'hll_sketch_agg', 'hll_sketch_estimate', 'hll_union', 'hll_union_ag
g', 'hour', 'hours', 'hypot', 'ifnull', 'ilike', 'initcap', 'inline', 'inline_out
er', 'input_file_block_length', 'input_file_block_start', 'input_file_name', 'ins
pect', 'instr', 'isnan', 'isnotnull', 'isnull', 'java_method', 'json_array_lengt
h', 'json_object_keys', 'json_tuple', 'kurtosis', 'lag', 'last', 'last_day', 'las
t_value', 'lcase', 'lead', 'least', 'left', 'length', 'levenshtein', 'like', 'li
t', 'ln', 'localtimestamp', 'locate', 'log', 'log10', 'log1p', 'log2', 'lower',
'lpad', 'ltrim', 'make_date', 'make_dt_interval', 'make_interval', 'make_timestam
p', 'make_timestamp_ltz', 'make_timestamp_ntz', 'make_ym_interval', 'map_concat',
'map_contains_key', 'map_entries', 'map_filter', 'map_from_arrays', 'map_from_ent
ries', 'map_keys', 'map_values', 'map_zip_with', 'mask', 'max', 'max_by', 'md5',
'mean', 'median', 'min', 'min_by', 'minute', 'mode', 'monotonically_increasing_i
d', 'month', 'months', 'months_between', 'named_struct', 'nanvl', 'negate', 'nega
tive', 'next_day', 'now', 'nth_value', 'ntile', 'nullif', 'nvl', 'nvl2', 'octet_l
ength', 'overlay', 'overload', 'pandas_udf', 'parse_url', 'percent_rank', 'percen
tile', 'percentile_approx', 'pi', 'pmod', 'posexplode', 'posexplode_outer', 'posi
tion', 'positive', 'pow', 'power', 'printf', 'product', 'quarter', 'radians', 'ra
ise_error', 'rand', 'randn', 'rank', 'reduce', 'reflect', 'regexp', 'regexp_coun
t', 'regexp_extract', 'regexp_extract_all', 'regexp_instr', 'regexp_like', 'regex
p_replace', 'regexp_substr', 'regr_avgx', 'regr_avgy', 'regr_count', 'regr_interc
ept', 'regr_r2', 'regr_slope', 'regr_sxx', 'regr_sxy', 'regr_syy', 'repeat', 'rep
lace', 'reverse', 'right', 'rint', 'rlike', 'round', 'row_number', 'rpad', 'rtri
m', 'schema_of_csv', 'schema_of_json', 'sec', 'second', 'sentences', 'sequence',
'session_window', 'sha', 'sha1', 'sha2', 'shiftLeft', 'shiftRight', 'shiftRightUn
signed', 'shiftleft', 'shiftright', 'shiftrightunsigned', 'shuffle', 'sign', 'sig
num', 'sin', 'sinh', 'size', 'skewness', 'slice', 'some', 'sort_array', 'sounde
x', 'spark_partition_id', 'split', 'split_part', 'sqrt', 'stack', 'startswith',
'std', 'stddev', 'stddev_pop', 'stddev_samp', 'str_to_map', 'struct', 'substr',
'substring', 'substring_index', 'sum', 'sumDistinct', 'sum_distinct', 'sys', 'ta
n', 'tanh', 'timestamp_micros', 'timestamp_millis', 'timestamp_seconds', 'toDegre
es', 'toRadians', 'to_binary', 'to_char', 'to_csv', 'to_date', 'to_json', 'to_num
ber', 'to_str', 'to_timestamp', 'to_timestamp_ltz', 'to_timestamp_ntz', 'to_unix_
timestamp', 'to_utc_timestamp', 'to_varchar', 'transform', 'transform_keys', 'tra
nsform_values', 'translate', 'trim', 'trunc', 'try_add', 'try_aes_decrypt', 'try_
avg', 'try_divide', 'try_element_at', 'try_multiply', 'try_remote_functions', 'tr
y_subtract', 'try_sum', 'try_to_binary', 'try_to_number', 'try_to_timestamp', 'ty
peof', 'ucase', 'udf', 'udtf', 'unbase64', 'unhex', 'unix_date', 'unix_micros',
'unix_millis', 'unix_seconds', 'unix_timestamp', 'unwrap_udt', 'upper', 'url_deco
de', 'url_encode', 'user', 'var_pop', 'var_samp', 'variance', 'version', 'warning
s', 'weekday', 'weekofyear', 'when', 'width_bucket', 'window', 'window_time', 'xp
ath', 'xpath_boolean', 'xpath_double', 'xpath_float', 'xpath_int', 'xpath_long',
'xpath_number', 'xpath_short', 'xpath_string', 'xxhash64', 'year', 'years', 'zip_
with']
In [75]: dir
In [77]: help(functions.upper)
Help on function upper in module pyspark.sql.functions:
upper(col: 'ColumnOrName') -> pyspark.sql.column.Column
    Converts a string expression to upper case.
.. versionadded:: 1.5.0
.. versionchanged:: 3.4.0
Supports Spark Connect.
Parameters
----------
col : :class:`~pyspark.sql.Column` or str
target column to work on.
Returns
-------
:class:`~pyspark.sql.Column`
upper case values.
Examples
--------
>>> df = spark.createDataFrame(["Spark", "PySpark", "Pandas API"], "STRING")
>>> df.select(upper("value")).show()
+------------+
|upper(value)|
+------------+
| SPARK|
| PYSPARK|
| PANDAS API|
+------------+
In [78]: df3.select(upper("Class")).show(5)
+------------+
|upper(Class)|
+------------+
| IRIS-SETOSA|
| IRIS-SETOSA|
| IRIS-SETOSA|
| IRIS-SETOSA|
| IRIS-SETOSA|
+------------+
only showing top 5 rows
+-----------------+
|min(Petal length)|
+-----------------+
| 1|
+-----------------+
+------------------+
| avg(Petal length)|
+------------------+
|3.3066666666666666|
+------------------+
In [83]: df3.show()
+------------+-----------+------------+-----------+-----------+
|Sepal length|Sepal width|Petal length|Petal width| Species|
+------------+-----------+------------+-----------+-----------+
| 5| 3| 1| 0|Iris-setosa|
| 4| 3| 1| 0|Iris-setosa|
| 4| 3| 1| 0|Iris-setosa|
| 4| 3| 1| 0|Iris-setosa|
| 5| 3| 1| 0|Iris-setosa|
| 5| 3| 1| 0|Iris-setosa|
| 4| 3| 1| 0|Iris-setosa|
| 5| 3| 1| 0|Iris-setosa|
| 4| 2| 1| 0|Iris-setosa|
| 4| 3| 1| 0|Iris-setosa|
| 5| 3| 1| 0|Iris-setosa|
| 4| 3| 1| 0|Iris-setosa|
| 4| 3| 1| 0|Iris-setosa|
| 4| 3| 1| 0|Iris-setosa|
| 5| 4| 1| 0|Iris-setosa|
| 5| 4| 1| 0|Iris-setosa|
| 5| 3| 1| 0|Iris-setosa|
| 5| 3| 1| 0|Iris-setosa|
| 5| 3| 1| 0|Iris-setosa|
| 5| 3| 1| 0|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 20 rows