PySpark DataFrame: convert country names to ISO codes with country-converter

Solution 1:

Update:

For large datasets, convert each distinct country name once and build a dictionary, instead of calling the converter for every row.

import country_converter as coco
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf, lit, collect_set, create_map
from itertools import chain

# Build a sample DataFrame: 3 distinct names repeated 200k times = 600k rows.
data = [['United States'], ['United Kingdom'], ['Not a country']] * 200000
df = spark.createDataFrame(data, ['countries'])

# Convert each DISTINCT country name exactly once, then apply the result as a
# literal map expression -- far cheaper than calling the converter per row.
# Nulls are skipped: a null key would poison the literal-map lookup.
unique_countries = [
    row[0]
    for row in df.select('countries').distinct().collect()
    if row[0] is not None
]
unique_countries_formatted = coco.convert(unique_countries, to='ISO3', not_found=None)
# coco.convert returns a bare string (not a list) when given a single name;
# normalize so zip() below always pairs element-wise instead of char-by-char.
if isinstance(unique_countries_formatted, str):
    unique_countries_formatted = [unique_countries_formatted]

# {country: country formatted} dictionary built from the unique values only.
uc_dict = dict(zip(unique_countries, unique_countries_formatted))

# Create a Spark map expression from the dict to apply on the DataFrame.
mapping_expr = create_map([lit(x) for x in chain(*uc_dict.items())])

# Look each column value up in the map; names absent from the map yield null.
df = df.withColumn('countries_formatted', mapping_expr[df.countries])

df.show()

Command took 8.85 seconds — 600k rows converted in under 9 seconds.

Alternatively, you can use a UDF to do it:

import country_converter as coco
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf, lit

# Sample data: two real countries, one unmatched name, and a null,
# to exercise every branch of the converter.
sample_rows = [
    ['United States'],
    ['United Kingdom'],
    ['Not a country'],
    [None],
]
df = spark.createDataFrame(sample_rows, ['countries'])

def country_converter(country):
    """Convert a single country name to its ISO3 code.

    Returns the ISO3 code, the original string when the converter finds
    no match (not_found=None), or None when the input is null/empty.
    """
    return coco.convert(country, to='ISO3', not_found=None) if country else None

# Register the converter as a string-returning Spark UDF, then apply it
# row-by-row to produce the formatted column.
cc_udf = udf(country_converter, StringType())
df = df.withColumn('countries_formatted', cc_udf(df['countries']))
df.show()

Output:

+--------------+-------------------+
|     countries|countries_formatted|
+--------------+-------------------+
| United States|                USA|
|United Kingdom|                GBR|
| Not a country|      Not a country|
|          null|               null|
+--------------+-------------------+