说明
Hive、Phoenix、MySQL、Oracle、Spark SQL、Flink SQL 函数
使用 Spark SQL 的内置函数,需要先导入 import org.apache.spark.sql.functions._(若还需使用 $"col" 等列引用语法,另需导入 spark.implicits._),以下函数是基于 Spark 2.0.2 整理而来。
函数 | 说明 | ||
basic | |||
withExpr(expr: Expression): Column | |||
col(colName: String): Column | |||
column(colName: String): Column | |||
lit(literal: Any): Column | 返回一个常量列 | ||
stack | 多列转多行示例; | ||
字符函数 | |||
concat_ws(sep: String, exprs: Column*): Column | 多行转一列示例(结合collect_set); | ||
数字 | |||
rand(seed: Long): Column | |||
rand(): Column | 生成随机数,示例 | ||
randn(seed: Long): Column | |||
randn(): Column | |||
序号 | |||
monotonicallyIncreasingId(): Column | 生成ID(Spark 2.0 起已弃用,建议改用 monotonically_increasing_id),示例 | ||
排序 | |||
asc(columnName: String): Column | |||
desc(columnName: String): Column | |||
聚合(Aggregate) | |||
approxCountDistinct(e: Column): Column | |||
approxCountDistinct(columnName: String): Column | |||
approxCountDistinct(e: Column, rsd: Double): Column | |||
approxCountDistinct(columnName: String, rsd: Double): Column | |||
avg(e: Column): Column | |||
avg(columnName: String): Column | |||
collect_list(e: Column): Column | |||
collect_list(columnName: String): Column | |||
collect_set(e: Column): Column | |||
collect_set(columnName: String): Column | |||
corr(column1: Column, column2: Column): Column | |||
corr(columnName1: String, columnName2: String): Column | |||
count(e: Column): Column | 示例 | ||
count(columnName: String): TypedColumn[Any, Long] | |||
countDistinct(expr: Column, exprs: Column*): Column | |||
countDistinct(columnName: String, columnNames: String*): Column | |||
covar_pop(column1: Column, column2: Column): Column | |||
covar_pop(columnName1: String, columnName2: String): Column | |||
covar_samp(column1: Column, column2: Column): Column | |||
covar_samp(columnName1: String, columnName2: String): Column | |||
first(e: Column, ignoreNulls: Boolean): Column | |||
first(columnName: String, ignoreNulls: Boolean): Column | |||
first(e: Column): Column | |||
first(columnName: String): Column | |||
grouping(e: Column): Column | |||
grouping(columnName: String): Column | |||
grouping_id(cols: Column*): Column | |||
grouping_id(colName: String, colNames: String*): Column | |||
kurtosis(e: Column): Column | |||
kurtosis(columnName: String): Column | |||
last(e: Column, ignoreNulls: Boolean): Column | |||
last(columnName: String, ignoreNulls: Boolean): Column | |||
last(e: Column): Column | |||
last(columnName: String): Column | |||
max(e: Column): Column | 不支持arrayType | ||
max(columnName: String): Column | |||
mean(e: Column): Column | |||
mean(columnName: String): Column | |||
min(e: Column): Column | |||
min(columnName: String): Column | |||
skewness(e: Column): Column | |||
skewness(columnName: String): Column | |||
stddev(e: Column): Column | |||
stddev(columnName: String): Column | |||
stddev_samp(e: Column): Column | |||
stddev_samp(columnName: String): Column | |||
stddev_pop(e: Column): Column | |||
stddev_pop(columnName: String): Column | |||
sum(e: Column): Column | |||
sum(columnName: String): Column | spark-sql如果字段值有null,sum操作忽略null | ||
sumDistinct(e: Column): Column | |||
sumDistinct(columnName: String): Column | |||
variance(e: Column): Column | |||
variance(columnName: String): Column | |||
var_samp(e: Column): Column | |||
var_samp(columnName: String): Column | |||
var_pop(e: Column): Column | |||
var_pop(columnName: String): Column | |||
非聚合(Non-aggregate) | |||
coalesce | 返回一个不为null的值,coalesce($"age",lit(20)) | ||
聚集(Collection) | |||
explode(e: Column): Column | 一列转多行示例 | ||
posexplode(e: Column): Column | |||
窗口(Windows) | |||
cume_dist(): Column | |||
dense_rank(): Column | partitionBy与orderby都会导致序号值改变; | ||
lag(e: Column, offset: Int): Column | |||
lag(columnName: String, offset: Int): Column | |||
lag(columnName: String, offset: Int, defaultValue: Any): Column | |||
lag(e: Column, offset: Int, defaultValue: Any): Column | |||
lead(columnName: String, offset: Int): Column | |||
lead(e: Column, offset: Int): Column | |||
lead(columnName: String, offset: Int, defaultValue: Any): Column | |||
lead(e: Column, offset: Int, defaultValue: Any): Column | |||
ntile(n: Int): Column | |||
percent_rank(): Column | |||
rank(): Column | |||
row_number() over(Window.partitionBy(***)) | |||
Shuffle | |||
distinct | 去重 返回一个 dataframe 类型 | ||
countDistinct(colName: String) | 统计去重后的行数,返回一个数值结果 | ||
groupBy(col1: String, cols: String*) | 根据某些字段来汇总,返回 GroupedData 类型;df.groupBy("age").agg(Map("age" ->"count")).show() 与 df.groupBy("age").avg().show() 都可以 |
||
join(right: DataFrame, joinExprs: Column, joinType: String) | 一个是关联的 dataframe,第二个关联的条件,第三个关联的类型:inner, outer, left_outer, right_outer, leftsemi df.join(ds,df("name")===ds("name") and df("age")===ds("age"),"outer").show(); |
||
Column | |||
between(lowerBound: Any, upperBound: Any): Column | |||
dataset | coalesce(e: Column*): Column | coalesce与repartition使用 | |
repartition | |||
pivot | 多行转多列示例 | ||
Action | |||
collect() | 返回值是一个数组,返回 dataframe 集合所有的行 | ||
collectAsList() | 返回值是一个 java 类型的数组,返回 dataframe 集合所有的行 | ||
count() | 返回一个 number 类型的,返回 dataframe 集合的行数 | ||
describe(cols: String*) | 返回一个通过数学计算的类表值(count, mean, stddev, min, and max),这个可以传多个参数,中间用逗号分隔,如果有字段为空,那么不参与 运算,只这对数值类型的字段。例如 df.describe("age", "height").show() | ||
first() | 返回第一行 ,类型是 row 类型 | ||
head() | 返回第一行 ,类型是 row 类型 | ||
head(n:Int) | 返回 n 行 ,类型是 row 类型 | ||
show() | 返回 dataframe 集合的值 默认是 20 行,返回类型是 unit | ||
show(n:Int) | 返回 n 行,返回值类型是 unit | ||
take(n:Int) | 返回 n 行,类型是 row 类型 | ||
SparkSession.read.orc(paths:String) | |||
SparkSession.read.orc(paths:String*) |
abs(e: Column): Column
array(cols: Column*): Column
array(colName: String, colNames: String*): Column
map(cols: Column*): Column
broadcast[T](df: Dataset[T]): Dataset[T]
input_file_name(): Column
isnan(e: Column): Column
isnull(e: Column): Column
monotonically_increasing_id(): Column
nanvl(col1: Column, col2: Column): Column
negate(e: Column): Column
not(e: Column): Column
spark_partition_id(): Column
sqrt(e: Column): Column
sqrt(colName: String): Column
struct(cols: Column*): Column
struct(colName: String, colNames: String*): Column
when(condition: Column, value: Any): Column
bitwiseNOT(e: Column): Column
expr(expr: String): Column
acos(e: Column): Column
acos(columnName: String): Column
asin(e: Column): Column
asin(columnName: String): Column
atan(e: Column): Column
atan(columnName: String): Column
atan2(l: Column, r: Column): Column
atan2(l: Column, rightName: String): Column
atan2(leftName: String, r: Column): Column
atan2(leftName: String, rightName: String): Column
atan2(l: Column, r: Double): Column
atan2(leftName: String, r: Double): Column
atan2(l: Double, r: Column): Column
atan2(l: Double, rightName: String): Column
bin(e: Column): Column
bin(columnName: String): Column
cbrt(e: Column): Column
cbrt(columnName: String): Column
ceil(e: Column): Column
ceil(columnName: String): Column
conv(num: Column, fromBase: Int, toBase: Int): Column
cos(e: Column): Column
cos(columnName: String): Column
cosh(e: Column): Column
cosh(columnName: String): Column
exp(e: Column): Column
exp(columnName: String): Column
expm1(e: Column): Column
expm1(columnName: String): Column
factorial(e: Column): Column
floor(e: Column): Column
floor(columnName: String): Column
greatest(exprs: Column*): Column
scala.annotation.varargs
greatest(columnName: String, columnNames: String*): Column
hex(column: Column): Column
unhex(column: Column): Column
hypot(l: Column, r: Column): Column
hypot(l: Column, rightName: String): Column
hypot(leftName: String, r: Column): Column
hypot(leftName: String, rightName: String): Column
hypot(l: Column, r: Double): Column
hypot(leftName: String, r: Double): Column
hypot(l: Double, r: Column): Column
hypot(l: Double, rightName: String): Column
least(exprs: Column*): Column
least(columnName: String, columnNames: String*): Column
log(e: Column): Column
log(columnName: String): Column
log(base: Double, a: Column): Column
log(base: Double, columnName: String): Column
log10(e: Column): Column
log10(columnName: String): Column
log1p(e: Column): Column
log1p(columnName: String): Column
log2(expr: Column): Column
log2(columnName: String): Column
pow(l: Column, r: Column): Column
pow(l: Column, rightName: String): Column
pow(leftName: String, r: Column): Column
pow(leftName: String, rightName: String): Column
pow(l: Column, r: Double): Column
pow(leftName: String, r: Double): Column
pow(l: Double, r: Column): Column
pow(l: Double, rightName: String): Column
pmod(dividend: Column, divisor: Column): Column
rint(e: Column): Column
rint(columnName: String): Column
round(e: Column): Column
round(e: Column, scale: Int): Column
bround(e: Column): Column
bround(e: Column, scale: Int): Column
shiftLeft(e: Column, numBits: Int): Column
shiftRight(e: Column, numBits: Int): Column
shiftRightUnsigned(e: Column, numBits: Int): Column
signum(e: Column): Column
signum(columnName: String): Column
sin(e: Column): Column
sin(columnName: String): Column
sinh(e: Column): Column
sinh(columnName: String): Column
tan(e: Column): Column
tan(columnName: String): Column
tanh(e: Column): Column
tanh(columnName: String): Column
toDegrees(e: Column): Column
toDegrees(columnName: String): Column
toRadians(e: Column): Column
toRadians(columnName: String): Column
md5(e: Column): Column
sha1(e: Column): Column
sha2(e: Column, numBits: Int): Column
crc32(e: Column): Column
hash(cols: Column*): Column
ascii(e: Column): Column
base64(e: Column): Column
concat(exprs: Column*): Column
decode(value: Column, charset: String): Column
encode(value: Column, charset: String): Column
format_number(x: Column, d: Int): Column
format_string(format: String, arguments: Column*): Column
initcap(e: Column): Column
instr(str: Column, substring: String): Column
length(e: Column): Column 用于处理string、binary类型的数据(原因)
lower(e: Column): Column
levenshtein(l: Column, r: Column): Column
locate(substr: String, str: Column): Column
locate(substr: String, str: Column, pos: Int): Column
lpad(str: Column, len: Int, pad: String): Column
ltrim(e: Column): Column
regexp_extract(e: Column, exp: String, groupIdx: Int): Column
regexp_replace(e: Column, pattern: String, replacement: String): Column 使用示例
unbase64(e: Column): Column
rpad(str: Column, len: Int, pad: String): Column
repeat(str: Column, n: Int): Column
reverse(str: Column): Column
rtrim(e: Column): Column
soundex(e: Column): Column
split(str: Column, pattern: String): Column
substring(str: Column, pos: Int, len: Int): Column
substring_index(str: Column, delim: String, count: Int): Column 具体用法
translate(src: Column, matchingString: String, replaceString: String): Column
trim(e: Column): Column = withExpr { StringTrim(e.expr) }
upper(e: Column): Column = withExpr { Upper(e.expr) }
add_months(startDate: Column, numMonths: Int): Column
current_date(): Column
current_timestamp(): Column
date_format(dateExpr: Column, format: String): Column
date_add(start: Column, days: Int): Column
date_sub(start: Column, days: Int): Column
datediff(end: Column, start: Column): Column
year(e: Column): Column
quarter(e: Column): Column
month(e: Column): Column
dayofmonth(e: Column): Column
dayofyear(e: Column): Column
hour(e: Column): Column
last_day(e: Column): Column
minute(e: Column): Column
months_between(date1: Column, date2: Column): Column
next_day(date: Column, dayOfWeek: String): Column
second(e: Column): Column
weekofyear(e: Column): Column
from_unixtime(ut: Column): Column
from_unixtime(ut: Column, f: String): Column
unix_timestamp(): Column
unix_timestamp(s: Column): Column
unix_timestamp(s: Column, p: String): Column
to_date(e: Column): Column
trunc(date: Column, format: String): Column
from_utc_timestamp(ts: Column, tz: String): Column
to_utc_timestamp(ts: Column, tz: String): Column
window(timeColumn: Column,windowDuration: String,slideDuration: String,startTime: String): Column
window(timeColumn: Column, windowDuration: String, slideDuration: String): Column
window(timeColumn: Column, windowDuration: String): Column
array_contains(column: Column, value: Any): Column
get_json_object(e: Column, path: String): Column
json_tuple(json: Column, fields: String*): Column
size(e: Column): Column 用于处理array、map类型的数据(原因)
sort_array(e: Column): Column
sort_array(e: Column, asc: Boolean): Column
udf[$typeTags](f: Function$x[$types]): UserDefinedFunction
udf[RT: TypeTag](f: Function0[RT]): UserDefinedFunction
udf[RT: TypeTag, A1: TypeTag](f: Function1[A1, RT]): UserDefinedFunction
udf[RT: TypeTag, A1: TypeTag, A2: TypeTag](f: Function2[A1, A2, RT]): UserDefinedFunction
udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag](f: Function3[A1, A2, A3, RT]): UserDefinedFunction
udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag](f: Function4[A1, A2, A3, A4, RT]): UserDefinedFunction
udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag](f: Function5[A1, A2, A3, A4, A5, RT]): UserDefinedFunction
udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag](f: Function6[A1, A2, A3, A4, A5, A6, RT]): UserDefinedFunction
udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag](f: Function7[A1, A2, A3, A4, A5, A6, A7, RT]): UserDefinedFunction
def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag](f: Function8[A1, A2, A3, A4, A5, A6, A7, A8, RT]): UserDefinedFunction
udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag](f: Function9[A1, A2, A3, A4, A5, A6, A7, A8, A9, RT]): UserDefinedFunction
udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag, A6: TypeTag, A7: TypeTag, A8: TypeTag, A9: TypeTag, A10: TypeTag](f: Function10[A1, A2, A3, A4, A5, A6, A7, A8, A9, A10, RT]): UserDefinedFunction
udf(f: AnyRef, dataType: DataType): UserDefinedFunction
callUDF(udfName: String, cols: Column*): Column
注意:本文归作者所有,未经作者允许,不得转载