How to perform union on two DataFrames with different amounts of columns in spark?

In Scala you just have to append all missing columns as nulls.

import org.apache.spark.sql.functions._

// let df1 and df2 the Dataframes to merge
val df1 = sc.parallelize(List(
  (50, 2),
  (34, 4)
)).toDF("age", "children")

val df2 = sc.parallelize(List(
  (26, true, 60000.00),
  (32, false, 35000.00)
)).toDF("age", "education", "income")

val cols1 = df1.columns.toSet
val cols2 = df2.columns.toSet
val total = cols1 ++ cols2 // union

def expr(myCols: Set[String], allCols: Set[String]) = {
  allCols.toList.map(x => x match {
    case x if myCols.contains(x) => col(x)
    case _ => lit(null).as(x)
  })
}

df1.select(expr(cols1, total):_*).unionAll(df2.select(expr(cols2, total):_*)).show()

+---+--------+---------+-------+
|age|children|education| income|
+---+--------+---------+-------+
| 50|       2|     null|   null|
| 34|       4|     null|   null|
| 26|    null|     true|60000.0|
| 32|    null|    false|35000.0|
+---+--------+---------+-------+

Update

Both temporal DataFrames will have the same order of columns, because we are mapping through total in both cases.

df1.select(expr(cols1, total):_*).show()
df2.select(expr(cols2, total):_*).show()

+---+--------+---------+------+
|age|children|education|income|
+---+--------+---------+------+
| 50|       2|     null|  null|
| 34|       4|     null|  null|
+---+--------+---------+------+

+---+--------+---------+-------+
|age|children|education| income|
+---+--------+---------+-------+
| 26|    null|     true|60000.0|
| 32|    null|    false|35000.0|
+---+--------+---------+-------+

Spark 3.1+

df = df1.unionByName(df2, allowMissingColumns=True)

Test results:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

data1=[
(1 , '2016-08-29', 1 , 2, 3),
(2 , '2016-08-29', 1 , 2, 3),
(3 , '2016-08-29', 1 , 2, 3)]
df1 = spark.createDataFrame(data1, ['code' , 'date' , 'A' , 'B', 'C'])
data2=[
(5 , '2016-08-29', 1, 2, 3, 4),
(6 , '2016-08-29', 1, 2, 3, 4),
(7 , '2016-08-29', 1, 2, 3, 4)]
df2 = spark.createDataFrame(data2, ['code' , 'date' , 'B', 'C', 'D', 'E'])

df = df1.unionByName(df2, allowMissingColumns=True)
df.show()
#     +----+----------+----+---+---+----+----+
#     |code|      date|   A|  B|  C|   D|   E|
#     +----+----------+----+---+---+----+----+
#     |   1|2016-08-29|   1|  2|  3|null|null|
#     |   2|2016-08-29|   1|  2|  3|null|null|
#     |   3|2016-08-29|   1|  2|  3|null|null|
#     |   5|2016-08-29|null|  1|  2|   3|   4|
#     |   6|2016-08-29|null|  1|  2|   3|   4|
#     |   7|2016-08-29|null|  1|  2|   3|   4|
#     +----+----------+----+---+---+----+----+

Spark 2.3+

diff1 = [c for c in df2.columns if c not in df1.columns]
diff2 = [c for c in df1.columns if c not in df2.columns]
df = df1.select('*', *[F.lit(None).alias(c) for c in diff1]) \
    .unionByName(df2.select('*', *[F.lit(None).alias(c) for c in diff2]))

Test results:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

data1=[
(1 , '2016-08-29', 1 , 2, 3),
(2 , '2016-08-29', 1 , 2, 3),
(3 , '2016-08-29', 1 , 2, 3)]
df1 = spark.createDataFrame(data1, ['code' , 'date' , 'A' , 'B', 'C'])
data2=[
(5 , '2016-08-29', 1, 2, 3, 4),
(6 , '2016-08-29', 1, 2, 3, 4),
(7 , '2016-08-29', 1, 2, 3, 4)]
df2 = spark.createDataFrame(data2, ['code' , 'date' , 'B', 'C', 'D', 'E'])

diff1 = [c for c in df2.columns if c not in df1.columns]
diff2 = [c for c in df1.columns if c not in df2.columns]
df = df1.select('*', *[F.lit(None).alias(c) for c in diff1]) \
    .unionByName(df2.select('*', *[F.lit(None).alias(c) for c in diff2]))
df.show()
#     +----+----------+----+---+---+----+----+
#     |code|      date|   A|  B|  C|   D|   E|
#     +----+----------+----+---+---+----+----+
#     |   1|2016-08-29|   1|  2|  3|null|null|
#     |   2|2016-08-29|   1|  2|  3|null|null|
#     |   3|2016-08-29|   1|  2|  3|null|null|
#     |   5|2016-08-29|null|  1|  2|   3|   4|
#     |   6|2016-08-29|null|  1|  2|   3|   4|
#     |   7|2016-08-29|null|  1|  2|   3|   4|
#     +----+----------+----+---+---+----+----+

Here is my Python version:

from pyspark.sql import SparkSession, HiveContext
from pyspark.sql.functions import lit
from pyspark.sql import Row

def customUnion(df1, df2):
    cols1 = df1.columns
    cols2 = df2.columns
    total_cols = sorted(cols1 + list(set(cols2) - set(cols1)))
    def expr(mycols, allcols):
        def processCols(colname):
            if colname in mycols:
                return colname
            else:
                return lit(None).alias(colname)
        cols = map(processCols, allcols)
        return list(cols)
    appended = df1.select(expr(cols1, total_cols)).union(df2.select(expr(cols2, total_cols)))
    return appended

Here is sample usage:

data = [
    Row(zip_code=58542, dma='MIN'),
    Row(zip_code=58701, dma='MIN'),
    Row(zip_code=57632, dma='MIN'),
    Row(zip_code=58734, dma='MIN')
]

firstDF = spark.createDataFrame(data)

data = [
    Row(zip_code='534', name='MIN'),
    Row(zip_code='353', name='MIN'),
    Row(zip_code='134', name='MIN'),
    Row(zip_code='245', name='MIN')
]

secondDF = spark.createDataFrame(data)

customUnion(firstDF,secondDF).show()

How to perform union on two DataFrames with different amounts of columns in spark?

Update

Related

Recent Posts