PySpark DataFrame: Convert country names to ISO codes with country-converter
Solution 1:
Update:
In the case of huge data, you can use a dictionary to map all the values.
import country_converter as coco
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf, lit, collect_set, create_map
from itertools import chain
# 600k test rows: three distinct country strings repeated 200,000 times.
data = [['United States'], ['United Kingdom'], ['Not a country']] * 200000
df = spark.createDataFrame(data, ['countries'])

# Build a {country: converted country} dict from the distinct values only,
# so the (slow) conversion runs once per unique name rather than once per row.
unique_countries = df.select("countries").distinct().rdd.flatMap(lambda row: row).collect()
converted = coco.convert(unique_countries, to='ISO3', not_found=None)
uc_dict = dict(zip(unique_countries, converted))

# Flatten the dict into literal key/value pairs and build a Spark map column.
mapping_expr = create_map([lit(item) for item in chain(*uc_dict.items())])

# Look each row's country up in the map — a pure column expression, no UDF.
df = df.withColumn('countries_formatted', mapping_expr[df.countries])
df.show()
Command took 8.85 seconds — 600,000 records converted in under 9 seconds.
You should use a UDF to do it.
import country_converter as coco
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf, lit
df = spark.createDataFrame(
    [['United States'], ['United Kingdom'], ['Not a country'], [None]],
    ['countries'],
)

def country_converter(country):
    """Convert one country name to its ISO3 code.

    Returns None for None/empty input; coco returns the input unchanged
    when the name is not recognized (not_found=None).
    """
    return coco.convert(country, to='ISO3', not_found=None) if country else None

# Register the function as a Spark UDF returning a string column.
cc_udf = udf(country_converter, StringType())
df = df.withColumn("countries_formatted", cc_udf(df.countries))
df.show()
Output:
+--------------+-------------------+
| countries|countries_formatted|
+--------------+-------------------+
| United States| USA|
|United Kingdom| GBR|
| Not a country| Not a country|
| null| null|
+--------------+-------------------+