SCD Implementation with Databricks Delta | zongbao.blog() http://www.yuzongbao.com/2019/08/05/scd-implementation-with-datab...
1 of 7 15-06-2023, 11:40
# Load Data
# NOTE(review): the source/target CSV paths were truncated in the original
# extraction ("/mnt/vp...") — confirm exact mount paths before running.
target_df = spark.read.option("sep", ",").option("header", "true").option("inferSchema", "true").csv("/mnt/vpa-raw-data-dev/POC/target.csv")
source_df = spark.read.option("sep", ",").option("header", "true").option("inferSchema", "true").csv("/mnt/vpa-raw-data-dev/POC/source.csv")

# Delta tables
# Recreate the `source` Delta table pointing at the raw-data mount, then
# empty it and overwrite its files with the freshly loaded CSV data.
spark.sql("DROP TABLE IF EXISTS source")
spark.sql("CREATE TABLE source (ID STRING, Name STRING, Owner STRING, Description STRING) USING DELTA LOCATION '/mnt/vpa-raw-data-dev/POC/source'")
spark.sql("TRUNCATE TABLE source")
source_df.write.format("delta").mode("overwrite").option("mergeSchema", "true").save('/mnt/vpa-raw-data-dev/POC/source')

from pyspark.sql.functions import *
from pyspark.sql.types import *

# The target table carries an extra RowStatus column used to tag the kind of
# change detected by the MERGE steps below (1 = Type 1, 2 = Type 2, 3 = new).
spark.sql("DROP TABLE IF EXISTS target")
spark.sql("CREATE TABLE target (ID STRING, Name STRING, Owner STRING, Description STRING, RowStatus STRING) USING DELTA LOCATION '/mnt/vpa-raw-data-dev/POC/target'")
SCD Implementation with Databricks Delta | zongbao.blog() http://www.yuzongbao.com/2019/08/05/scd-implementation-with-datab...
2 of 7 15-06-2023, 11:40
# Empty the target table, then seed it with the loaded data plus a NULL
# RowStatus column (populated later by the merge steps: 1 / 2 / 3).
spark.sql("TRUNCATE TABLE target")
target_df = target_df.withColumn('RowStatus', lit(None).cast(StringType()))
(
    target_df
    .write
    .format("delta")
    .mode("overwrite")
    .save("/mnt/vpa-raw-data-dev/POC/target")
)
%sql
-- Pass 1: tag SCD Type 2 changes (Owner or Name differs -> RowStatus = 2)
-- and insert brand-new source rows (no ID match -> RowStatus = 3).
-- NULL-safe equality (<=>) is used so NULL vs. value counts as a change.
MERGE INTO target
USING source
ON target.ID = source.ID
WHEN MATCHED AND (NOT(source.Owner <=> target.Owner) OR NOT(source.Name <=> target.Name)) THEN
  UPDATE SET
    target.RowStatus = 2,
    target.Owner = source.Owner,
    target.Name = source.Name,
    target.Description = source.Description
WHEN NOT MATCHED THEN
  INSERT (ID, Name, Owner, Description, RowStatus)
  VALUES (source.ID, source.Name, source.Owner, source.Description, 3)
-- Pass 2: tag SCD Type 1 changes (RowStatus = 1). Only rows left untouched
-- by pass 1 (RowStatus still NULL) whose Description differs are updated,
-- so a combined Type 2 + Description change keeps RowStatus = 2.
MERGE INTO target
USING source
ON target.ID = source.ID
WHEN MATCHED AND (NOT(source.Description <=> target.Description)) AND (target.RowStatus IS NULL) THEN
  UPDATE SET
    target.RowStatus = 1,
    target.Description = source.Description
SCD Implementation with Databricks Delta | zongbao.blog() http://www.yuzongbao.com/2019/08/05/scd-implementation-with-datab...
4 of 7 15-06-2023, 11:40
# Filter out SCD Type 1 and 2 rows from the target Delta table, and stage
# them into a temp table in Azure SQL for the UPDATE statements that follow.
# NOTE(review): the WHERE clause was truncated in the original extraction —
# reconstructed as "RowStatus = 2 OR RowStatus = 1"; confirm against source.
scd12_df = spark.sql("SELECT ID, Name, Owner, Description, RowStatus FROM target WHERE ( RowStatus = 2 OR RowStatus = 1 )")
scd12_df.write.mode("overwrite").jdbc(url = jdbcUrl, table = "Scd_tmp", properties = connectionProperties)
%scala
import org.apache.spark.sql.SQLContext
import com.microsoft.azure.sqldb.spark.config.Config
import com.microsoft.azure.sqldb.spark.query._
import com.microsoft.azure.sqldb.spark.connect._

// Builds the Azure SQL connection config carrying one ad-hoc query.
// Credentials here are placeholders — supply real values via a secret scope.
def queryConfig(query: String): Config = Config(Map(
  "url"          -> "dummy url",
  "databaseName" -> "dummy databaseName",
  "user"         -> "dummy user",
  "password"     -> "dummy pwd",
  "queryCustom"  -> query
))

// SCD Type 1: overwrite Description in place for rows staged with RowStatus = 1.
val scd1_query = """
  |UPDATE Scd
  |SET Scd.Description = Scd_tmp.Description
  |FROM Scd
  |INNER JOIN Scd_tmp
  |ON Scd.ID = Scd_tmp.ID AND Scd_tmp.RowStatus = '1';
""".stripMargin
sqlContext.sqlDBQuery(queryConfig(scd1_query))

// SCD Type 2: close out the current record — set Active_Record to 0 and
// Record_EndDate to the current datetime — for rows staged with RowStatus = 2.
val scd2_query2 = """
  |UPDATE Scd
  |SET Scd.Active_Record = '0', Scd.Record_EndDate = GETDATE()
  |FROM Scd
  |INNER JOIN Scd_tmp
  |ON Scd.ID = Scd_tmp.ID AND Scd_tmp.RowStatus = '2';
""".stripMargin
// BUG FIX: the original built scd2_config with "queryCustom" -> scd1_query,
// so the Type 2 close-out never executed; pass scd2_query2 instead.
sqlContext.sqlDBQuery(queryConfig(scd2_query2))
# Collect the brand-new rows (RowStatus = '3', created by the first MERGE's
# WHEN NOT MATCHED branch) so they can be appended to the Azure SQL table.
newinserted_df = spark.sql("SELECT ID, Name, Owner, Description FROM target WHERE RowStatus = '3'")
SCD Implementation with Databricks Delta | zongbao.blog() http://www.yuzongbao.com/2019/08/05/scd-implementation-with-datab...
6 of 7 15-06-2023, 11:40
# Append the newly inserted rows directly into the Azure SQL `Scd` table
# (no UPDATE needed for them — they have no prior version to close out).
newinserted_df.write.mode("append").jdbc(url = jdbcUrl, table = "Scd", properties = connectionProperties)
SCD Implementation with Databricks Delta | zongbao.blog() http://www.yuzongbao.com/2019/08/05/scd-implementation-with-datab...
7 of 7 15-06-2023, 11:40

SCD2-Implementation--inPySpark.pdf

  • 1.
    SCD Implementation withDatabricks Delta | zongbao.blog() http://www.yuzongbao.com/2019/08/05/scd-implementation-with-datab... 1 of 7 15-06-2023, 11:40
  • 2.
    # Load Data target_df= spark.read.option("sep", ",").option("header", "true").option("inferSchema", "true").csv("/mnt/vp source_df = spark.read.option("sep", ",").option("header", "true").option("inferSchema", "true").csv("/mnt/vp # Delta tables spark.sql("DROP TABLE IF EXISTS source") spark.sql("CREATE TABLE source (ID STRING, Name STRING, Owner STRING, Description STRING) USING DELTA LOCATIO spark.sql("TRUNCATE TABLE source") source_df.write.format("delta").mode("overwrite").option("mergeSchema", "true").save('/mnt/vpa-raw-data-dev/P from pyspark.sql.functions import * from pyspark.sql.types import * spark.sql("DROP TABLE IF EXISTS target") spark.sql("CREATE TABLE target (ID STRING, Name STRING, Owner STRING, Description STRING, RowStatus STRING) U SCD Implementation with Databricks Delta | zongbao.blog() http://www.yuzongbao.com/2019/08/05/scd-implementation-with-datab... 2 of 7 15-06-2023, 11:40
  • 3.
    spark.sql("TRUNCATE TABLE target") target_df= target_df.withColumn('RowStatus', lit(None).cast(StringType())) target_df.write.mode("overwrite").format("delta").save("/mnt/vpa-raw-data-dev/POC/target") %sql -- Update SCD Type 2 rows (RowStatus = 2) and Insert Not Match rows (RowStatus = 3) MERGE INTO target USING source ON target.ID = source.ID WHEN MATCHED AND (NOT(source.Owner <=> target.Owner) OR NOT(source.Name <=> target.Name)) THEN UPDATE SET SCD Implementation with Databricks Delta | zongbao.blog() http://www.yuzongbao.com/2019/08/05/scd-implementation-with-datab... 3 of 7 15-06-2023, 11:40
  • 4.
    target.RowStatus = 2, target.Owner= source.Owner, target.Name = source.Name, target.Description = source.Description WHEN NOT MATCHED THEN INSERT (ID,Name,Owner,Description,RowStatus) VALUES (source.ID,source.Name,source.Owner,source.Descrip -- Merge SCD Type 1 update (RowStatus = 1) MERGE INTO target USING source ON target.ID = source.ID WHEN MATCHED AND (target.RowStatus IS NULL) AND (NOT(source.Description <=> target.Description)) THEN UPDATE SET target.RowStatus = 1, target.Description = source.Description SCD Implementation with Databricks Delta | zongbao.blog() http://www.yuzongbao.com/2019/08/05/scd-implementation-with-datab... 4 of 7 15-06-2023, 11:40
  • 5.
    #Filter out SCDType 1 and 2 rows from target Delta table, and save into one temp table in Azure SQL scd12_df = spark.sql("SELECT ID, Name, Owner, Description, RowStatus FROM target WHERE ( RowStatus = 2 OR Row scd12_df.write.mode("overwrite").jdbc(url = jdbcUrl, table = "Scd_tmp", properties = connectionProperties) %scala import org.apache.spark.sql.SQLContext import com.microsoft.azure.sqldb.spark.config.Config import com.microsoft.azure.sqldb.spark.query._ import com.microsoft.azure.sqldb.spark.connect._ //Update columns value for those SCD Type 1 change only row val scd1_query = """ SCD Implementation with Databricks Delta | zongbao.blog() http://www.yuzongbao.com/2019/08/05/scd-implementation-with-datab... 5 of 7 15-06-2023, 11:40
  • 6.
    |UPDATE Scd |SET Scd.Description= Scd_tmp.Description |FROM Scd |INNER JOIN Scd_tmp |ON Scd.ID = Scd_tmp.ID AND Scd_tmp.RowStatus = '1'; """.stripMargin val scd1_config = Config(Map( "url" -> "dummy url", "databaseName" -> "dummy databaseName", "user" -> "dummy user", "password" -> "dummy pwd", "queryCustom" -> scd1_query )) sqlContext.sqlDBQuery(scd1_config) //Update SCD Type 2 row: Set Active_Record as 0, and Record_EndDate as current datatime. val scd2_query2 = """ |UPDATE Scd |SET Scd.Active_Record = '0', Scd.Record_EndDate = GETDATE() |FROM Scd |INNER JOIN Scd_tmp |ON Scd.ID = Scd_tmp.ID AND Scd_tmp.RowStatus = '2'; """.stripMargin val scd2_config = Config(Map( "url" -> "dummy url", "databaseName" -> "dummy databaseName", "user" -> "dummy user", "password" -> "dummy pwd", "queryCustom" -> scd1_query )) sqlContext.sqlDBQuery(scd2_config) newinserted_df = spark.sql("SELECT ID, Name, Owner, Description FROM target WHERE RowStatus = '3'") SCD Implementation with Databricks Delta | zongbao.blog() http://www.yuzongbao.com/2019/08/05/scd-implementation-with-datab... 6 of 7 15-06-2023, 11:40
  • 7.
    newinserted_df.write.mode("append").jdbc(url = jdbcUrl,table = "Scd", properties = connectionProperties) SCD Implementation with Databricks Delta | zongbao.blog() http://www.yuzongbao.com/2019/08/05/scd-implementation-with-datab... 7 of 7 15-06-2023, 11:40