# Load every retail CSV under data/retail-data/all/ into a single DataFrame.
# The first row of each file is used as the header, and column types are
# inferred from the data instead of defaulting to strings.
# The chain is wrapped in parentheses so it continues across lines
# implicitly; no trailing backslashes are needed.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("data/retail-data/all/*.csv")
    .coalesce(5)  # collapse the result to at most 5 partitions
)
# Mark the DataFrame for caching so the aggregations below reuse it.
df.cache()
# Register the DataFrame as a temporary view so it can be queried via SQL.
df.createOrReplaceTempView("dfTable")
# COMMAND ----------
from pyspark.sql.functions import count

# Aggregate the whole DataFrame down to a single count of StockCode values,
# then print the one-row result.
stock_code_total = df.select(count("StockCode"))
stock_code_total.show() # 541909
# count: number of rows whose StockCode is not null.
from pyspark.sql.functions import count
df.select(count("StockCode")).show() # 541909
# countDistinct: number of distinct StockCode values.
from pyspark.sql.functions import countDistinct
df.select(countDistinct("StockCode")).show() # 4070