from time import time, sleep
from helpers import delete_all_s3
import duckdb
from icedb.log import S3Client, IceLogIO
from icedb import IceDBv3, CompressionCodec
from datetime import datetime, timezone

# S3 configuration dictionary
S3_CONFIG = {
    "s3_region": "us-east-1",
    "s3_endpoint": "http://localhost:9000",
    "s3_access_key_id": "user",
    "s3_secret_access_key": "password",
    "s3_use_ssl": False,
    "s3_url_style": "path"  # can be 'path' or 'vhost'
}
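
# NOTE: these values assume a MinIO instance running locally on port 9000
# (e.g. via docker compose) with `user`/`password` credentials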

# Bucket-specific S3 config not used by DuckDB
S3_BUCKET_CONFIG = {
    "bucket": "testbucket",
    "prefix": "example",
}

# create an s3 client to talk to minio
s3c = S3Client(
    s3prefix=S3_BUCKET_CONFIG["prefix"],
    s3bucket=S3_BUCKET_CONFIG["bucket"],
    s3region=S3_CONFIG["s3_region"],
    s3endpoint=S3_CONFIG["s3_endpoint"],
    s3accesskey=S3_CONFIG["s3_access_key_id"],
    s3secretkey=S3_CONFIG["s3_secret_access_key"]
)
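
# Everything this client reads and writes is scoped under
# s3://testbucket/example/ (the bucket and prefix configured above)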

example_events = [
    {
        "ts": 1686176939445,
        "event": "page_load",
        "user_id": "user_a",
        "properties": {
            "page_name": "Home"
        }
    }, {
        "ts": 1676126229999,
        "event": "page_load",
        "user_id": "user_b",
        "properties": {
            "page_name": "Home"
        }
    }, {
        "ts": 1686176939666,
        "event": "page_load",
        "user_id": "user_a",
        "properties": {
            "page_name": "Settings"
        }
    }, {
        "ts": 1686176941445,
        "event": "page_load",
        "user_id": "user_a",
        "properties": {
            "page_name": "Home"
        }
    }
]
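
# For reference: the three user_a events above fall on 2023-06-07 UTC and the
# user_b event on 2023-02-11 UTC, so part_func below spreads these rows across
# two partitions: u=user_a/d=2023-06-07 and u=user_b/d=2023-02-11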


def part_func(row: dict) -> str:
    """
    Partition by user_id, date
    """
    row_time = datetime.fromtimestamp(row['ts'] / 1000, tz=timezone.utc)
    part = f"u={row['user_id']}/d={row_time.strftime('%Y-%m-%d')}"
    return part
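
# Quick sanity check (illustrative only, not used by the pipeline):
#   part_func(example_events[0]) == "u=user_a/d=2023-06-07"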


# Initialize the client
ice = IceDBv3(
    partition_function=part_func,  # Partitions by user_id and date
    sort_order=['event', 'ts'],  # Sort by event, then timestamp of the event within the data part
    # S3 settings from config
    s3_region=S3_CONFIG["s3_region"],
    s3_access_key=S3_CONFIG["s3_access_key_id"],
    s3_secret_key=S3_CONFIG["s3_secret_access_key"],
    s3_endpoint=S3_CONFIG["s3_endpoint"],
    s3_use_path=S3_CONFIG["s3_url_style"] == "path",
    # S3 client instance
    s3_client=s3c,
    # Other settings
    path_safe_hostname="dan-mbp",
    compression_codec=CompressionCodec.ZSTD,  # Use ZSTD for a higher compression ratio than the default SNAPPY
)
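
# Within each written data part, rows are stored in `sort_order` order
# (event, then ts), which typically improves Parquet compression and lets
# readers skip row groups more effectively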


def once():
    # Insert records
    inserted = ice.insert(example_events)
    print(f"{len(inserted)} created files (ice.insert): {', '.join(x.path for x in inserted)}")

    # Read the log state as of "now" (epoch milliseconds)
    log = IceLogIO("demo-host")
    _, file_markers, log_tombstones, log_files = log.read_at_max_time(s3c, round(time() * 1000))
    print(f"{len(log_files)} log files: {', '.join(log_files)}")
    print(f"{len(log_tombstones)} log tombstones: {', '.join(x.path for x in log_tombstones)}")
    # A file marker with a tombstone is a data part that a merge has superseded;
    # it remains listed until tombstone cleanup deletes it
    alive_files = list(filter(lambda x: x.tombstone is None, file_markers))
    tombstoned_files = list(filter(lambda x: x.tombstone is not None, file_markers))
    print(f"{len(alive_files)} alive files: {', '.join(x.path for x in alive_files)}")
    print(f"{len(tombstoned_files)} tombstoned files: {', '.join(x.path for x in tombstoned_files)}")
    print(f"file_markers: {file_markers}")

    # Setup duckdb for querying local minio
    ddb = duckdb.connect(":memory:")
    ddb.execute("install httpfs")
    ddb.execute("load httpfs")

    # Set DuckDB S3 configuration from the config dictionary
    for key, value in S3_CONFIG.items():
        if key == "s3_endpoint":
            # Strip the protocol prefix by splitting on :// once
            value = value.split("://", 1)[1]
        ddb.execute(f"SET {key}='{value}'")
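
    # Note: DuckDB's httpfs extension expects s3_endpoint as host[:port] with
    # no scheme; HTTP vs HTTPS is controlled by the s3_use_ssl setting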

    # Query alive files
    query = ("select user_id, count(*), (properties::JSON)->>'page_name' as page "
             "from read_parquet([{}]) "
             "group by user_id, page "
             "order by count(*) desc").format(
        ', '.join(f"'s3://{ice.data_s3c.s3bucket}/{x.path}'" for x in alive_files)
    )
    print(ddb.sql(query))
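
    # With the example data the top row should be user_a with 2 'Home'
    # page_loads, followed by user_a/'Settings' and user_b/'Home' at 1 each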

    # Merge small data parts (icedb merges one partition per call;
    # `partition` is None when nothing qualified for merging)
    new_log, new_file_marker, partition, merged_file_markers, meta = ice.merge()
    if partition is not None:  # if any merge happened
        print(f"Merged partition: {partition}")
        if merged_file_markers:
            print(f"- {len(merged_file_markers)} source files merged: {', '.join(x.path for x in merged_file_markers)}")
            print(f"- merged_file_markers {merged_file_markers}")
        print(f"- into: {new_file_marker.path}")
        print(f"- new log: {new_log}")

    # Clean up tombstoned files older than the given minimum age (milliseconds)
    cleaned_logs, deleted_logs, deleted_data = ice.tombstone_cleanup(1_000)
    print(f"{len(cleaned_logs)} cleaned log files: {', '.join(cleaned_logs)}")
    print(f"{len(deleted_logs)} deleted log files: {', '.join(deleted_logs)}")
    print(f"{len(deleted_data)} deleted data files: {', '.join(deleted_data)}")


# wipe everything at the start
delete_all_s3(s3c)

# run the insert/query/merge/cleanup cycle repeatedly
for i in range(30):
    try:
        once()
    except Exception:
        print(f"Failed after {i} successful runs")
        raise
    sleep(1)

# wipe everything at the end if successful
delete_all_s3(s3c)