
Commit 03b9c9e

log merging breaks in sample after few retries #136 (#137)
* Fix logs/data getting out of sync during cleanup
* Update icedb/icedb.py: kept because we must preserve tombstones for `min_age` (Co-authored-by: Dan Goodman <[email protected]>)
* Update icedb/icedb.py: the latest time at which a tombstone is allowed to be deleted (Co-authored-by: Dan Goodman <[email protected]>)
* refactor: Clarify comment describing tombstone expiration time calculation
* now change
* review feedback

---------

Co-authored-by: Dan Goodman <[email protected]>
1 parent 3cbb48f commit 03b9c9e
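The review notes above both refer to the tombstone expiration cutoff used by tombstone_cleanup: a tombstone created at createdMS may only be deleted once it is at least min_age_ms old, i.e. createdMS <= now - min_age_ms. The change to icedb/icedb.py below factors this cutoff into an `expired` variable and keeps unexpired tombstones out of the delete sets. A minimal standalone sketch of that rule, for illustration only (the helper name below is hypothetical, not part of the icedb API):

from time import time

def tombstone_may_be_deleted(created_ms: int, min_age_ms: int, now_ms: int) -> bool:
    # Hypothetical helper; icedb computes this inline in tombstone_cleanup.
    # `expired` is the latest creation time at which a tombstone is still allowed to be deleted.
    expired = now_ms - min_age_ms
    return created_ms <= expired

# Example: with min_age_ms=1_000, a tombstone created 5 seconds ago is old enough to delete.
now = round(time() * 1000)
print(tombstone_may_be_deleted(created_ms=now - 5_000, min_age_ms=1_000, now_ms=now))  # True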


4 files changed: +185 additions, -13 deletions


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -2,3 +2,4 @@
 .parquet
 __pycache__
 .idea
+.aider*

Lines changed: 159 additions & 0 deletions
@@ -0,0 +1,159 @@
from time import time, sleep
from helpers import delete_all_s3
import duckdb
from icedb.log import S3Client, IceLogIO, FileMarker
from icedb import IceDBv3, CompressionCodec
from datetime import datetime

# S3 configuration dictionary
S3_CONFIG = {
    "s3_region": "us-east-1",
    "s3_endpoint": "http://localhost:9000",
    "s3_access_key_id": "user",
    "s3_secret_access_key": "password",
    "s3_use_ssl": False,
    "s3_url_style": "path"  # can be 'path' or 'vhost'
}

# Bucket-specific S3 config not used by DuckDB
S3_BUCKET_CONFIG = {
    "bucket": "testbucket",
    "prefix": "example",
}

# create an s3 client to talk to minio
s3c = S3Client(
    s3prefix=S3_BUCKET_CONFIG["prefix"],
    s3bucket=S3_BUCKET_CONFIG["bucket"],
    s3region=S3_CONFIG["s3_region"],
    s3endpoint=S3_CONFIG["s3_endpoint"],
    s3accesskey=S3_CONFIG["s3_access_key_id"],
    s3secretkey=S3_CONFIG["s3_secret_access_key"]
)

example_events = [
    {
        "ts": 1686176939445,
        "event": "page_load",
        "user_id": "user_a",
        "properties": {
            "page_name": "Home"
        }
    }, {
        "ts": 1676126229999,
        "event": "page_load",
        "user_id": "user_b",
        "properties": {
            "page_name": "Home"
        }
    }, {
        "ts": 1686176939666,
        "event": "page_load",
        "user_id": "user_a",
        "properties": {
            "page_name": "Settings"
        }
    }, {
        "ts": 1686176941445,
        "event": "page_load",
        "user_id": "user_a",
        "properties": {
            "page_name": "Home"
        }
    }
]


def part_func(row: dict) -> str:
    """
    Partition by user_id, date
    """
    row_time = datetime.utcfromtimestamp(row['ts'] / 1000)
    part = f"u={row['user_id']}/d={row_time.strftime('%Y-%m-%d')}"
    return part


# Initialize the client
ice = IceDBv3(
    partition_function=part_func,  # Partitions by user_id and date
    sort_order=['event', 'ts'],  # Sort by event, then timestamp of the event within the data part
    # S3 settings from config
    s3_region=S3_CONFIG["s3_region"],
    s3_access_key=S3_CONFIG["s3_access_key_id"],
    s3_secret_key=S3_CONFIG["s3_secret_access_key"],
    s3_endpoint=S3_CONFIG["s3_endpoint"],
    s3_use_path=S3_CONFIG["s3_url_style"] == "path",
    # S3 client instance
    s3_client=s3c,
    # Other settings
    path_safe_hostname="dan-mbp",
    compression_codec=CompressionCodec.ZSTD,  # Use ZSTD for higher compression ratio compared to default SNAPPY
)


def once():
    # Insert records
    inserted = ice.insert(example_events)
    print(f"{len(inserted)} created files (ice.insert): {', '.join(x.path for x in inserted)}")

    # Read the log state
    log = IceLogIO("demo-host")
    _, file_markers, log_tombstones, log_files = log.read_at_max_time(s3c, round(time() * 1000))
    print(f"{len(log_files)} log files: {', '.join(log_files)}")
    print(f"{len(log_tombstones)} log tombstones: {', '.join(x.path for x in log_tombstones)}")
    alive_files = list(filter(lambda x: x.tombstone is None, file_markers))
    tombstoned_files = list(filter(lambda x: x.tombstone is not None, file_markers))
    print(f"{len(alive_files)} alive files: {', '.join(x.path for x in alive_files)}")
    print(f"{len(tombstoned_files)} tombstoned files: {', '.join(x.path for x in tombstoned_files)}")
    print(f"file_markers: {file_markers}")

    # Setup duckdb for querying local minio
    ddb = duckdb.connect(":memory:")
    ddb.execute("install httpfs")
    ddb.execute("load httpfs")

    # Set DuckDB S3 configuration from the config dictionary
    for key, value in S3_CONFIG.items():
        if key == "s3_endpoint":
            # Strip protocol prefix by splitting on :// once
            value = value.split("://", 1)[1]
        ddb.execute(f"SET {key}='{value}'")

    # Query alive files
    query = ("select user_id, count(*), (properties::JSON)->>'page_name' as page "
             "from read_parquet([{}]) "
             "group by user_id, page "
             "order by count(*) desc").format(
        ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
    )
    print(ddb.sql(query))

    new_log, new_file_marker, partition, merged_file_markers, meta = ice.merge()
    if partition is not None:  # if any merge happened
        print(f"Merged partition: {partition}")
        if merged_file_markers:
            print(f"- {len(merged_file_markers)} source files merged: {', '.join(x.path for x in merged_file_markers)}")
            print(f"- merged_file_markers {merged_file_markers}")
        print(f"- into: {new_file_marker.path}")
        print(f"- new log: {new_log}")

    cleaned_logs, deleted_logs, deleted_data = ice.tombstone_cleanup(1_000)
    print(f"{len(cleaned_logs)} cleaned log files: {', '.join(cleaned_logs)}")
    print(f"{len(deleted_logs)} deleted log files: {', '.join(deleted_logs)}")
    print(f"{len(deleted_data)} deleted data files: {', '.join(deleted_data)}")


# wipe everything at the start
delete_all_s3(s3c)

for i in range(30):
    try:
        once()
    except Exception as e:
        print(f"Failed after {i} runs")
        raise e
    sleep(1)

# wipe everything at the end if successful
delete_all_s3(s3c)

icedb/icedb.py

Lines changed: 24 additions & 12 deletions
@@ -340,34 +340,46 @@ def tombstone_cleanup(self, min_age_ms: int) -> tuple[list[str], list[str], list
         now = round(time() * 1000)
 
         log_files_to_delete: dict[str, bool] = {}
+        log_files_to_keep: dict[str, LogTombstone] = {}
         data_files_to_delete: dict[str, bool] = {}
         data_files_to_keep: dict[str, FileMarker] = {}
         schema = Schema()
 
-        current_log_files = logio.get_current_log_files(self.log_s3c)
+        cur_schema, cur_files, cur_tombstones, all_log_files = logio.read_at_max_time(self.log_s3c, now)
+
         # We only need to get merge files
-        merge_log_files = list(filter(lambda x: get_log_file_info(x['Key'])[1], current_log_files))
-        for file in merge_log_files:
+        merge_log_files = list(filter(lambda x: get_log_file_info(x)[1], all_log_files))
+
+        for log_file in merge_log_files:
             obj = self.log_s3c.s3.get_object(
                 Bucket=self.log_s3c.s3bucket,
-                Key=file['Key']
+                Key=log_file
             )
             jsonl = str(obj['Body'].read(), encoding="utf-8").split("\n")
             meta_json = json.loads(jsonl[0])
             meta = LogMetadataFromJSON(meta_json)
-
+            expired = now - min_age_ms  # time at which a tombstone is allowed to be deleted
             # Log tombstones
             if meta.tombstoneLineIndex is not None:
                 for i in range(meta.tombstoneLineIndex, meta.fileLineIndex):
                     tmb = LogTombstoneFromJSON(dict(json.loads(jsonl[i])))
-                    if tmb.createdMS <= now - min_age_ms:
+                    if tmb.createdMS <= expired:
                         log_files_to_delete[tmb.path] = True
-
+                    else:
+                        log_files_to_keep[tmb.path] = tmb
             # File markers
             for i in range(meta.fileLineIndex, len(jsonl)):
                 fm_json = dict(json.loads(jsonl[i]))
                 fm = FileMarkerFromJSON(fm_json)
-                if fm.createdMS <= now - min_age_ms and fm.tombstone is not None:
+
+                tombstone = fm.tombstone
+                if not tombstone:
+                    # find fm.path in cur_files
+                    for cf in cur_files:
+                        if cf.path == fm.path:
+                            tombstone = cf.tombstone
+                            break
+                if tombstone is not None and tombstone <= expired:
                     data_files_to_delete[fm.path] = True
                     if fm.path in data_files_to_keep:
                         del data_files_to_keep[fm.path]
@@ -378,7 +390,7 @@ def tombstone_cleanup(self, min_age_ms: int) -> tuple[list[str], list[str], list
             schema_json = dict(json.loads(jsonl[meta.schemaLineIndex]))
             schema.accumulate(list(schema_json.keys()), list(schema_json.values()))
 
-            cleaned_log_files.append(file['Key'])
+            cleaned_log_files.append(log_file)
 
         # Delete log tombstones
         for log_path in log_files_to_delete.keys():
@@ -394,7 +406,7 @@ def tombstone_cleanup(self, min_age_ms: int) -> tuple[list[str], list[str], list
                 Bucket=self.data_s3c.s3bucket,
                 Key=data_path
             )
-            deleted_log_files.append(data_path)
+            deleted_data_files.append(data_path)
 
 
 
@@ -404,7 +416,7 @@ def tombstone_cleanup(self, min_age_ms: int) -> tuple[list[str], list[str], list
             1,
             schema,
             list(data_files_to_keep.values()),
-            None,
+            list(log_files_to_keep.values()),  # kept because we must preserve tombstones for `min_age`
             merged=True,
            timestamp=round(time()*1000)
         )
@@ -574,4 +586,4 @@ def rewrite_partition(self, target_partition: str, filter_query: str) -> tuple[s
             merged=True
         )
 
-        return new_log, meta, list(map(lambda x: x.path, rewrite_targets))
+        return new_log, meta, list(map(lambda x: x.path, rewrite_targets))

setup.py

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 from setuptools import setup, find_packages
 
-VERSION = '0.9.1'
+VERSION = '0.9.2'
 DESCRIPTION = 'IceDB'
 LONG_DESCRIPTION = 'Parquet merge engine'
 