Adding a column counting cumulative previous repeating values
One solution is to use the difference between two row numbers to create a group for each consecutive run of equal values, and then compute row_number() within each group to get the result:
from pyspark.sql import functions as F, Window

df = spark.createDataFrame([(0,), (0,), (5,), (5,), (-1,), (0,), (0,), (0,)], ["Value"])

df.withColumn("ID", F.monotonically_increasing_id()) \
  .withColumn("group",
              # rows in the same consecutive run of equal values share the same difference
              F.row_number().over(Window.orderBy("ID"))
              - F.row_number().over(Window.partitionBy("Value").orderBy("ID"))
  ) \
  .withColumn("Result", F.row_number().over(Window.partitionBy("Value", "group").orderBy("ID"))) \
  .drop("ID", "group") \
  .show()
#+-----+------+
#|Value|Result|
#+-----+------+
#|    0|     1|
#|    0|     2|
#|    5|     1|
#|    5|     2|
#|    0|     1|
#|    0|     2|
#|    0|     3|
#|   -1|     1|
#+-----+------+
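
To see how the groups are formed, you can keep the intermediate columns instead of dropping them. This is a minimal sketch under the same assumptions (an active spark session and the df defined above); the rn_all and rn_value column names are only for illustration:

# The global row number minus the per-value row number stays constant within
# each consecutive run of equal values, so that difference identifies the run.
df.withColumn("ID", F.monotonically_increasing_id()) \
  .withColumn("rn_all", F.row_number().over(Window.orderBy("ID"))) \
  .withColumn("rn_value", F.row_number().over(Window.partitionBy("Value").orderBy("ID"))) \
  .withColumn("group", F.col("rn_all") - F.col("rn_value")) \
  .orderBy("ID") \
  .show()
# group values in input order: 0, 0, 2, 2, 4, 3, 3, 3
# (Value, group) then uniquely identifies each run, which is why the Result
# window above partitions by both columns.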