
Commit ff7d35e

Log data separation (#126)
* use dedicated log s3
* working test
* readme
* readme

1 parent: 4f17827

13 files changed: +121 −97 lines

README.md

Lines changed: 12 additions & 3 deletions
````diff
@@ -44,12 +44,13 @@ the ClickHouse S3 function `s3('https://icedb-s3-proxy/**/*.parquet')` or DuckDB
 * [Why not Iceberg?](#why-not-iceberg)
 * [When not to use IceDB](#when-not-to-use-icedb)
 * [Tips before you dive in](#tips-before-you-dive-in)
-  * [Forcing number types](#forcing-number-types)
+  * [Forcing property types](#forcing-property-types)
   * [Insert in large batches](#insert-in-large-batches)
   * [Merge and Tombstone clean often](#merge-and-tombstone-clean-often)
   * [Large partitions, sort your data well!](#large-partitions-sort-your-data-well)
   * [Schema validation before insert](#schema-validation-before-insert)
   * [Tracking the running schema](#tracking-the-running-schema)
+  * [Separation of log and data](#separation-of-log-and-data)
 * [Usage](#usage)
   * [Partition function (`part_func`)](#partition-function-part_func)
   * [Sorting Order (`sort_order`)](#sorting-order-sort_order)
@@ -87,7 +88,7 @@ from datetime import datetime
 from time import time
 
 # create an s3 client to talk to minio
-s3c = S3Client(s3prefix="example", s3bucket="testbucket", s3region="us-east-1", s3endpoint="http://localhost:9000",
+s3c = S3Client(s3prefix="example", s3bucket="testbucket", s3region="us-east-1", s3endpoint="http://localhost:9000",
                s3accesskey="user", s3secretkey="password")
 
 example_events = [
@@ -122,6 +123,7 @@ example_events = [
     }
 ]
 
+
 def part_func(row: dict) -> str:
     """
     Partition by user_id, date
@@ -130,6 +132,7 @@ def part_func(row: dict) -> str:
     part = f"u={row['user_id']}/d={row_time.strftime('%Y-%m-%d')}"
     return part
 
+
 # Initialize the client
 ice = IceDBv3(
     part_func,
@@ -169,7 +172,7 @@ query = ("select user_id, count(*), (properties::JSON)->>'page_name' as page "
          "from read_parquet([{}]) "
          "group by user_id, page "
          "order by count(*) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 ```
@@ -456,6 +459,12 @@ See a simple [example here](examples/verify-schema.py) on verifying the schema b
 IceDB will track the running schema natively. One caveat to this functionality is that if you remove a column as a
 part of a partition rewrite and that column never returns, IceDB will not remove that from the schema.
 
+### Separation of log and data
+
+You can pass the optional `log_s3_client` to use a dedicated S3 client for log files. All IceDB instances MUST share the same configuration in this regard.
+
+This is useful when you want the log in lower-latency time-to-first-byte storage such as S3 Express One Zone, while keeping the data in lower-cost storage such as standard S3.
+
 ## Usage
 
 ```
````
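To make the new README section concrete, here is a minimal sketch of wiring up separate data and log clients. The `log_s3_client` parameter and the `data_s3c` attribute come from this commit; the import path, the `s3_client` keyword, and the sort order shown are assumptions for illustration only.

```python
from icedb import IceDBv3, S3Client  # assumed import path

# Data stays in the regular bucket; the log goes to a dedicated bucket
# (matching the testbucket-log created in docker-compose.yml below).
data_s3c = S3Client(s3prefix="example", s3bucket="testbucket",
                    s3region="us-east-1", s3endpoint="http://localhost:9000",
                    s3accesskey="user", s3secretkey="password")
log_s3c = S3Client(s3prefix="example", s3bucket="testbucket-log",
                   s3region="us-east-1", s3endpoint="http://localhost:9000",
                   s3accesskey="user", s3secretkey="password")

ice = IceDBv3(
    part_func,              # partition function from the README example
    ["user_id", "ts"],      # assumed sort order, for illustration
    s3_client=data_s3c,     # assumed keyword for the data client
    log_s3_client=log_s3c,  # optional log client introduced by this commit
)

# Every IceDB instance reading or writing this table must be configured
# with the same log/data client split.
```

Queries then resolve parquet paths through `ice.data_s3c`, as the example diffs below show.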

docker-compose.yml

Lines changed: 1 addition & 0 deletions
```diff
@@ -34,6 +34,7 @@ services:
       sleep 1;
       /usr/bin/mc alias set myminio http://minio:9000 user password;
       /usr/bin/mc mb myminio/testbucket;
+      /usr/bin/mc mb myminio/testbucket-log;
       exit 0;
       "
   # clickhouse:
```
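The compose file only provisions the extra bucket for local MinIO testing. If you bootstrap buckets from Python rather than `mc`, a boto3 sketch would look like this (not part of this commit; endpoint and credentials assumed to mirror the compose file):

```python
import boto3

# Point boto3 at the local MinIO from docker-compose.yml.
s3 = boto3.client(
    "s3",
    endpoint_url="http://localhost:9000",
    aws_access_key_id="user",
    aws_secret_access_key="password",
    region_name="us-east-1",
)

# Create the data bucket and the dedicated log bucket.
for bucket in ("testbucket", "testbucket-log"):
    try:
        s3.create_bucket(Bucket=bucket)
    except s3.exceptions.BucketAlreadyOwnedByYou:
        pass  # already provisioned, e.g. by the mc init container
```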

examples/api-falcon.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -218,7 +218,7 @@ def on_get(self, req, resp):
          "from read_parquet([{}]) "
          "group by user_id, page "
          "order by count(user_id) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 
 # return the result as text
```

examples/api-flask.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -210,7 +210,7 @@ def query_rows():
          "from read_parquet([{}]) "
          "group by user_id, page "
          "order by count(user_id) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 
 # return the result as text
```

examples/custom-merge-aggregation-with-custom-insert.py

Lines changed: 6 additions & 6 deletions
```diff
@@ -107,7 +107,7 @@ def part_func(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, event, event "
          "order by count(user_id) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -120,7 +120,7 @@ def part_func(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, event "
          "order by sum(cnt) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -138,7 +138,7 @@ def part_func(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, event "
          "order by count(user_id) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -151,7 +151,7 @@ def part_func(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, event "
          "order by sum(cnt) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -170,7 +170,7 @@ def part_func(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, event "
          "order by count(user_id) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -184,7 +184,7 @@ def part_func(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, event "
          "order by sum(cnt) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
```
examples/custom-merge-aggregation.py

Lines changed: 6 additions & 6 deletions
```diff
@@ -101,7 +101,7 @@ def part_func(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, event, event "
          "order by count(user_id) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -114,7 +114,7 @@ def part_func(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, event "
          "order by sum(cnt) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -132,7 +132,7 @@ def part_func(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, event "
          "order by count(user_id) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -145,7 +145,7 @@ def part_func(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, event "
          "order by sum(cnt) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -164,7 +164,7 @@ def part_func(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, event "
          "order by count(user_id) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -178,7 +178,7 @@ def part_func(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, event "
          "order by sum(cnt) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
```

examples/custom-merge-replacing.py

Lines changed: 6 additions & 6 deletions
```diff
@@ -126,7 +126,7 @@ def part_func(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, event "
          "order by count(user_id) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -139,7 +139,7 @@ def part_func(row: dict) -> str:
 query = ("select user_id, arg_max(event, ts), max(ts)::INT8, arg_max(properties, ts) "
          "from read_parquet([{}]) "
          "group by user_id ").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -159,7 +159,7 @@ def part_func(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, event "
          "order by count(user_id) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -172,7 +172,7 @@ def part_func(row: dict) -> str:
 query = ("select user_id, arg_max(event, ts), max(ts), arg_max(properties, ts) "
          "from read_parquet([{}]) "
          "group by user_id ").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -192,7 +192,7 @@ def part_func(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, event "
          "order by count(user_id) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -207,7 +207,7 @@ def part_func(row: dict) -> str:
 query = ("select user_id, arg_max(event, ts), max(ts), arg_max(properties, ts) "
          "from read_parquet([{}]) "
          "group by user_id ").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
```

examples/materialized-view.py

Lines changed: 7 additions & 7 deletions
```diff
@@ -113,7 +113,7 @@ def part_func_mv(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, event, event "
          "order by count(user_id) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice_raw.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice_raw.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -126,7 +126,7 @@ def part_func_mv(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, event "
          "order by sum(cnt) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice_mv.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice_mv.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -147,7 +147,7 @@ def part_func_mv(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, event "
          "order by count(user_id) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice_raw.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice_raw.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -160,7 +160,7 @@ def part_func_mv(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, event "
          "order by sum(cnt) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice_mv.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice_mv.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -188,7 +188,7 @@ def part_func_mv(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, event "
          "order by count(user_id) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice_raw.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice_raw.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -202,7 +202,7 @@ def part_func_mv(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, event "
          "order by count(user_id) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice_mv.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice_mv.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -216,7 +216,7 @@ def part_func_mv(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, event "
          "order by sum(cnt) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice_mv.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice_mv.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
```

examples/simple-full.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -82,7 +82,7 @@ def part_func(row: dict) -> str:
 # Run the query
 query = ("select * "
          "from read_parquet([{}]) ").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -96,7 +96,7 @@ def part_func(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, page "
          "order by count(*) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -125,7 +125,7 @@ def part_func(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, page "
          "order by count(*) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
 
@@ -148,7 +148,7 @@ def part_func(row: dict) -> str:
          "from read_parquet([{}]) "
          "group by user_id, page "
          "order by count(*) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
```

examples/verify-schema.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -122,7 +122,7 @@ def check_schema_conflicts(old: Schema, new: Schema):
          "from read_parquet([{}]) "
          "group by user_id, page "
          "order by count(user_id) desc").format(
-    ', '.join(list(map(lambda x: "'s3://" + ice.s3c.s3bucket + "/" + x.path + "'", alive_files)))
+    ', '.join(list(map(lambda x: "'s3://" + ice.data_s3c.s3bucket + "/" + x.path + "'", alive_files)))
 )
 print(ddb.sql(query))
```
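All of the example-file changes above are the same mechanical substitution: live parquet paths are now resolved against the data client's bucket (`ice.data_s3c.s3bucket`) instead of the former single `ice.s3c`. A sketch of the shared pattern, assuming `alive_files` has already been read from the log and `ddb` is a DuckDB connection configured for the S3 endpoint:

```python
# Build fully qualified s3:// URLs for every live data file. The bucket
# must come from the data client, since log files now live elsewhere.
paths = ", ".join(
    f"'s3://{ice.data_s3c.s3bucket}/{f.path}'" for f in alive_files
)

# Query the live files directly with DuckDB.
query = f"select user_id, count(*) from read_parquet([{paths}]) group by user_id"
print(ddb.sql(query))
```

The f-string join here is equivalent to the `map`/`lambda` construction used throughout the examples.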
