Commit c78f1e5

Misc dynamodb integration test improvements:
- DELTA_DYNAMO_TABLE_OVERWRITE env variable
- add run_id column
- cleanup
1 parent a990988 commit c78f1e5
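
The DELTA_DYNAMO_TABLE_OVERWRITE flag added here defaults to true; setting it to anything else skips the initial overwrite of the Delta table, so repeated runs can append to the same table, while the new run_id column (a fresh uuid4 per run) keeps each run's reads and row-count assertion isolated. A rough sketch of how a rerun against an existing table might be configured, following the export-style instructions in the script's docstring (bucket path and table name are the placeholder values used there):

export DELTA_TABLE_PATH=s3a://test-bucket/delta-test/
export DELTA_DYNAMO_TABLE=delta_log_test
export DELTA_DYNAMO_REGION=us-west-2
export DELTA_STORAGE=io.delta.storage.DynamoDBLogStore
export DELTA_NUM_ROWS=16
# any value other than "true" (case-insensitive) disables the initial overwrite
export DELTA_DYNAMO_TABLE_OVERWRITE=false

The test is then launched through ./run-integration-tests.py --run-storage-dynamodb-integration-tests as shown in the docstring in the diff below.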

File tree

1 file changed: +32 -24 lines changed

storage-dynamodb/integration_tests/dynamodb_logstore.py

Lines changed: 32 additions & 24 deletions
@@ -16,26 +16,17 @@
 import os
 import sys
 import threading
+import random
+import uuid
 
 from pyspark import SparkContext
-from pyspark.sql import Column, DataFrame, SparkSession, SQLContext, functions
-from pyspark.sql.functions import *
+from pyspark.sql import Column, DataFrame, SparkSession, SQLContext, functions as F
 from py4j.java_collections import MapConverter
 from delta.tables import *
 from multiprocessing.pool import ThreadPool
 import time
 
 """
-create required dynamodb table with:
-
-$ aws --region us-west-2 dynamodb create-table \
-    --table-name delta_log_test \
-    --attribute-definitions AttributeName=tablePath,AttributeType=S \
-                            AttributeName=fileName,AttributeType=S \
-    --key-schema AttributeName=tablePath,KeyType=HASH \
-                 AttributeName=fileName,KeyType=RANGE \
-    --provisioned-throughput ReadCapacityUnits=5,WriteCapacityUnits=5
-
 run this script in root dir of repository:
 
 export VERSION=$(cat version.sbt|cut -d '"' -f 2)
@@ -44,7 +35,7 @@
 export DELTA_TABLE_PATH=s3a://test-bucket/delta-test/
 export DELTA_DYNAMO_TABLE=delta_log_test
 export DELTA_DYNAMO_REGION=us-west-2
-export DELTA_STORAGE=io.delta.storage.DynamoDBLogStoreScala # TODO: remove `Scala` when Java version finished
+export DELTA_STORAGE=io.delta.storage.DynamoDBLogStore
 export DELTA_NUM_ROWS=16
 
 ./run-integration-tests.py --run-storage-dynamodb-integration-tests \
@@ -59,11 +50,11 @@
 concurrent_readers = int(os.environ.get("DELTA_CONCURRENT_READERS", 2))
 num_rows = int(os.environ.get("DELTA_NUM_ROWS", 16))
 
-# TODO change back to default io.delta.storage.DynamoDBLogStore
-delta_storage = os.environ.get("DELTA_STORAGE", "io.delta.storage.DynamoDBLogStoreScala")
+delta_storage = os.environ.get("DELTA_STORAGE", "io.delta.storage.DynamoDBLogStore")
 dynamo_table_name = os.environ.get("DELTA_DYNAMO_TABLE", "delta_log_test")
 dynamo_region = os.environ.get("DELTA_DYNAMO_REGION", "us-west-2")
 dynamo_error_rates = os.environ.get("DELTA_DYNAMO_ERROR_RATES", "")
+table_overwrite = os.environ.get("DELTA_DYNAMO_TABLE_OVERWRITE", "true").lower() == "true"
 
 if delta_table_path is None:
     print(f"\nSkipping Python test {os.path.basename(__file__)} due to the missing env variable "
@@ -90,24 +81,41 @@
     .master("local[*]") \
     .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
     .config("spark.delta.logStore.class", delta_storage) \
-    .config("spark.delta.DynamoDBLogStoreScala.tableName", dynamo_table_name) \
-    .config("spark.delta.DynamoDBLogStoreScala.region", dynamo_region) \
-    .config("spark.delta.DynamoDBLogStoreScala.errorRates", dynamo_error_rates) \
+    .config("spark.delta.DynamoDBLogStore.tableName", dynamo_table_name) \
+    .config("spark.delta.DynamoDBLogStore.region", dynamo_region) \
+    .config("spark.delta.DynamoDBLogStore.errorRates", dynamo_error_rates) \
     .getOrCreate()
 
-data = spark.createDataFrame([], "id: int, a: int")
-data.write.format("delta").mode("overwrite").partitionBy("id").save(delta_table_path)
+SCHEMA = "run_id: string, id: int, a: int"
+
+RUN_ID = str(uuid.uuid4())
+
+data = spark.createDataFrame([], SCHEMA)
+
+if table_overwrite:
+    data.write.format("delta").mode("overwrite").partitionBy("run_id", "id").save(delta_table_path)
+
 
 def write_tx(n):
-    data = spark.createDataFrame([[n, n]], "id: int, a: int")
-    data.write.format("delta").mode("append").partitionBy("id").save(delta_table_path)
+    data = spark.createDataFrame([[RUN_ID, random.randrange(2**16), n]], SCHEMA)
+    data.write.format("delta").mode("append").partitionBy("run_id", "id").save(delta_table_path)
+
+
+def count():
+    return (
+        spark.read.format("delta")
+        .load(delta_table_path)
+        .filter(F.col("run_id") == RUN_ID)
+        .count()
+    )
 
 
 stop_reading = threading.Event()
 
 def read_data():
     while not stop_reading.is_set():
-        print("Reading {:d} rows ...".format(spark.read.format("delta").load(delta_table_path).distinct().count()))
+        cnt = count()
+        print(f"Reading {cnt} rows ...")
         time.sleep(1)
 
 
@@ -127,7 +135,7 @@ def start_read_thread():
 for thread in read_threads:
     thread.join()
 
-actual = spark.read.format("delta").load(delta_table_path).distinct().count()
+actual = count()
 print("Number of written rows:", actual)
 assert actual == num_rows
 