Commit d504b44
Internal observability agent (Datadog) + Cleanup Brainstore vars (#69)
* Default to true
* Remove index validate vars. Add extra writer vars
* Add support for internal datadog agent
* Pass extra env vars for the writer
* Add back in missing var
* Uncomment so it is less confusing. Remove vars
* Point at the alias rather than the version. Pointing at the version makes terraform apply slow. Pointing at the alias is faster since the provisioned config happens in the background.
* Comment
* Validate against the local module, not main
* Add quarantine lambda delete script
1 parent b442e4e commit d504b44
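The note about pointing provisioned concurrency at a Lambda alias rather than a published version is a general Lambda/Terraform pattern. The sketch below is not taken from this commit; it only illustrates the idea with hypothetical names (api_handler, live):

# Hypothetical illustration: attach provisioned concurrency to an alias.
# Publishing a new version only repoints the alias, so `terraform apply`
# does not block while the provisioned configuration is rebuilt.
resource "aws_lambda_alias" "api_handler_live" {
  name             = "live" # hypothetical alias name
  function_name    = aws_lambda_function.api_handler.function_name
  function_version = aws_lambda_function.api_handler.version # assumes publish = true
}

resource "aws_lambda_provisioned_concurrency_config" "api_handler" {
  function_name                     = aws_lambda_function.api_handler.function_name
  qualifier                         = aws_lambda_alias.api_handler_live.name
  provisioned_concurrent_executions = 1
}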

11 files changed (+319 / -125 lines)
examples/braintrust-data-plane/main.tf

Lines changed: 56 additions & 46 deletions
@@ -4,103 +4,113 @@ module "braintrust-data-plane" {
   source = "github.com/braintrustdata/terraform-braintrust-data-plane"
   # Append '?ref=<version_tag>' to lock to a specific version of the module.

+  ### Examples below are shown with the module defaults. You do not have to uncomment them
+  ### unless you want to change the default value.
+  ### The default values are for production-sized deployments.
+
   # This is primarily used for tagging and naming resources in your AWS account.
   # Do not change this after deployment. RDS and S3 resources can not be renamed.
   deployment_name = "braintrust"

   # Add your organization name from the Braintrust UI here
   braintrust_org_name = ""

-  ### Service Configuration
-  # The maximum number of concurrent executions to reserve and constrain Braintrust lambdas to.
-  # If you run Braintrust in a shared account you should set these to a reasonable limit to avoid
-  # impacting other non-Braintrust Lambdas. AWS has a global shared limit of 1000 concurrent executions per account.
-  # By default these are unlimited which is ideal for dedicated AWS account.
-  # Recommended 100 to 1000 for production in a shared account.
-  # api_handler_reserved_concurrent_executions = 100
-  # ai_proxy_reserved_concurrent_executions = 100
-
-  # The number API Handler instances to provision and keep alive. This reduces cold start times and improves latency, with some increase in cost.
-  # api_handler_provisioned_concurrency = 0
-
   ### Postgres configuration
-  # postgres_instance_type = "db.r8g.2xlarge"
+  # Changing this will incur a short downtime.
+  postgres_instance_type = "db.r8g.2xlarge"

-  # Storage size (in GB) for the RDS instance.
-  # postgres_storage_size = 1000
+  # Initial storage size (in GB) for the RDS instance.
+  postgres_storage_size = 1000
   # Maximum storage size (in GB) to allow the RDS instance to auto-scale to.
-  # postgres_max_storage_size = 4000
+  postgres_max_storage_size = 10000

   # Storage type for the RDS instance. Recommended io2 for large production deployments.
-  # postgres_storage_type = "gp3"
+  postgres_storage_type = "gp3"

   # Storage IOPS for the RDS instance. Only applicable if storage_type is io1, io2, or gp3.
-  # Recommended 15000 for production. Default for gp3 is 3000.
-  # postgres_storage_iops = 10000
+  # Recommended 15000 for production.
+  postgres_storage_iops = 15000

   # Throughput for the RDS instance. Only applicable if storage_type is gp3.
   # Recommended 500 for production if you are using gp3. Leave blank for io1 or io2
-  # postgres_storage_throughput = 500
+  postgres_storage_throughput = 500

   # PostgreSQL engine version for the RDS instance.
-  # postgres_version = "15"
+  postgres_version = "15"

   # Automatic upgrades of PostgreSQL minor engine version.
   # If true, AWS will automatically upgrade the minor version of the PostgreSQL engine for you.
   # Note: Don't include the minor version in your postgres_version if you want to use this.
   # If false, you will need to manually upgrade the minor version of the PostgreSQL engine.
-  # postgres_auto_minor_version_upgrade = true
+  postgres_auto_minor_version_upgrade = true

   # Multi-AZ RDS instance. Enabling increases cost but provides higher availability.
-  # Recommended for critical production environments.
-  # postgres_multi_az = true
+  # Recommended for critical production environments. Doubles the cost of the RDS instance.
+  # postgres_multi_az = false

   ### Brainstore configuration
   # The license key for the Brainstore instance. You can get this from the Braintrust UI in Settings > API URL.
   brainstore_license_key = var.brainstore_license_key

   # The number of Brainstore reader instances to provision
   # Recommended Graviton instance type with 16GB of memory
-  # brainstore_instance_count = 2
-  # brainstore_instance_type = "c8gd.4xlarge"
+  brainstore_instance_count = 2
+  brainstore_instance_type = "c8gd.4xlarge"

   # The number of dedicated Brainstore writer nodes to create
   # Recommended Graviton instance type with 32GB of memory
-  # brainstore_writer_instance_count = 1
-  # brainstore_writer_instance_type = "c8gd.8xlarge"
+  brainstore_writer_instance_count = 1
+  brainstore_writer_instance_type = "c8gd.8xlarge"
+

   ### Redis configuration
+
   # Default is acceptable for typical production deployments.
-  # redis_instance_type = "cache.t4g.medium"
+  redis_instance_type = "cache.t4g.medium"

   # Redis engine version
-  # redis_version = "7.0"
+  redis_version = "7.0"
+

   ### Network configuration
-  # CIDR block for the VPC. You might need to adjust this so it does not conflict with any
-  # other VPC CIDR blocks you intend to peer with Braintrust
+  # WARNING: You should choose these values carefully after discussing with your networking team.
+  # Changing them after the fact is not possible and will require a complete rebuild of your Braintrust deployment.
+
+  # CIDR block for the VPC. The core Braintrust services will be deployed in this VPC.
+  # You might need to adjust this so it does not conflict with any other VPC CIDR blocks you intend to peer with Braintrust.
   # vpc_cidr = "10.175.0.0/21"

   # CIDR block for the Quarantined VPC. This is used to run user defined functions in an isolated environment.
+  # You might need to adjust this so it does not conflict with any other VPC CIDR blocks you intend to peer with Braintrust
   # quarantine_vpc_cidr = "10.175.8.0/21"

-  ### Advanced configuration
-  # List of origins to whitelist for CORS
-  # whitelisted_origins = []
-
-  # Custom domain name for the CloudFront distribution
-  # custom_domain = null
-
-  # ARN of the ACM certificate for the custom domain
-  # custom_certificate_arn = null

-  # The maximum number of requests per user allowed in the time frame specified by outbound_rate_limit_window_minutes. Setting to 0 will disable rate limits
-  # outbound_rate_limit_max_requests = 0
+  ### Advanced configuration

-  # The time frame in minutes over which rate per-user rate limits are accumulated
-  # outbound_rate_limit_window_minutes = 1
+  # The maximum number of concurrent executions to reserve and constrain Braintrust lambdas to.
+  # If you run Braintrust in a dedicated account you can leave these at "-1" (unlimited).
+  # If you run Braintrust in a shared account you should set these to a reasonable limit to avoid
+  # impacting other non-Braintrust Lambdas. Recommended 100 to 1000 for production in a shared account.
+  # api_handler_reserved_concurrent_executions = -1
+  # ai_proxy_reserved_concurrent_executions = -1
+
+  # Uncomment these to set extra environment variables for the services.
+  # Only use this when instructed to by the Braintrust team.
+  # brainstore_extra_env_vars = {}
+  #
+  # brainstore_extra_env_vars_writer = {}
+  #
+  # service_extra_env_vars = {
+  #   APIHandler = {}
+  #   AIProxy = {}
+  #   CatchupETL = {}
+  #   MigrateDatabaseFunction = {}
+  #   QuarantineWarmupFunction = {}
+  # }
+
+
+  ### Braintrust Remote Support

-  ### Braintrust Support
   # Enable sharing of Cloudwatch logs with Braintrust staff
   # enable_braintrust_support_logs_access = true
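For reference, here is a hypothetical way the new extra-environment-variable inputs could be filled in, with made-up variable names and values (only do this when instructed to by the Braintrust team, per the comment in the diff):

# Hypothetical values purely for illustration.
brainstore_extra_env_vars = {
  BRAINSTORE_LOG_LEVEL = "debug" # made-up example variable
}
brainstore_extra_env_vars_writer = {
  BRAINSTORE_LOG_LEVEL = "info" # writer nodes can get their own set
}

service_extra_env_vars = {
  APIHandler               = { EXAMPLE_FLAG = "1" } # made-up example variable
  AIProxy                  = {}
  CatchupETL               = {}
  MigrateDatabaseFunction  = {}
  QuarantineWarmupFunction = {}
}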

main.tf

Lines changed: 23 additions & 20 deletions
@@ -164,26 +164,29 @@ module "brainstore" {
   source = "./modules/brainstore"
   count  = var.enable_brainstore ? 1 : 0

-  deployment_name                           = var.deployment_name
-  instance_count                            = var.brainstore_instance_count
-  instance_type                             = var.brainstore_instance_type
-  instance_key_pair_name                    = var.brainstore_instance_key_pair_name
-  port                                      = var.brainstore_port
-  license_key                               = var.brainstore_license_key
-  version_override                          = var.brainstore_version_override
-  s3_bucket_retention_days                  = var.brainstore_s3_bucket_retention_days
-  extra_env_vars                            = var.brainstore_extra_env_vars
-  writer_instance_count                     = var.brainstore_writer_instance_count
-  writer_instance_type                      = var.brainstore_writer_instance_type
-  brainstore_disable_optimization_worker    = var.brainstore_disable_optimization_worker
-  brainstore_vacuum_all_objects             = var.brainstore_vacuum_all_objects
-  brainstore_enable_index_validation        = var.brainstore_enable_index_validation
-  brainstore_index_validation_only_deletes  = var.brainstore_index_validation_only_deletes
-  database_host                             = module.database.postgres_database_address
-  database_port                             = module.database.postgres_database_port
-  database_secret_arn                       = module.database.postgres_database_secret_arn
-  redis_host                                = module.redis.redis_endpoint
-  redis_port                                = module.redis.redis_port
+  deployment_name                        = var.deployment_name
+  instance_count                         = var.brainstore_instance_count
+  instance_type                          = var.brainstore_instance_type
+  instance_key_pair_name                 = var.brainstore_instance_key_pair_name
+  port                                   = var.brainstore_port
+  license_key                            = var.brainstore_license_key
+  version_override                       = var.brainstore_version_override
+  s3_bucket_retention_days               = var.brainstore_s3_bucket_retention_days
+  extra_env_vars                         = var.brainstore_extra_env_vars
+  extra_env_vars_writer                  = var.brainstore_extra_env_vars_writer
+  writer_instance_count                  = var.brainstore_writer_instance_count
+  writer_instance_type                   = var.brainstore_writer_instance_type
+  brainstore_disable_optimization_worker = var.brainstore_disable_optimization_worker
+  brainstore_vacuum_all_objects          = var.brainstore_vacuum_all_objects
+  database_host                          = module.database.postgres_database_address
+  database_port                          = module.database.postgres_database_port
+  database_secret_arn                    = module.database.postgres_database_secret_arn
+  redis_host                             = module.redis.redis_endpoint
+  redis_port                             = module.redis.redis_port
+
+  internal_observability_api_key  = var.internal_observability_api_key
+  internal_observability_env_name = var.internal_observability_env_name
+  internal_observability_region   = var.internal_observability_region

   vpc_id            = module.main_vpc.vpc_id
   security_group_id = module.main_vpc.default_security_group_id
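The three internal_observability_* inputs passed here are new module variables whose declarations are not part of this diff. A minimal sketch of what they would plausibly look like (names taken from the diff; types, defaults, and descriptions are assumptions):

# Assumed variable declarations for the new inputs (not shown in this commit).
variable "internal_observability_api_key" {
  description = "API key for the internal observability (Datadog) agent. Leave empty to disable."
  type        = string
  default     = ""
  sensitive   = true
}

variable "internal_observability_env_name" {
  description = "Environment name reported to the observability backend (DD_ENV)."
  type        = string
  default     = ""
}

variable "internal_observability_region" {
  description = "Datadog site region prefix, e.g. a region of \"us5\" yields us5.datadoghq.com."
  type        = string
  default     = ""
}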

mise.toml

Lines changed: 23 additions & 4 deletions
@@ -8,8 +8,27 @@ terraform-docs = "latest"
 [tasks]
 lint = ["terraform fmt -recursive", "tflint --recursive"]
 setup = ["pre-commit install", "tflint --init"]
-validate = [
-  "terraform init && terraform validate",
-  "cd examples/braintrust-data-plane && terraform init && terraform validate",
-]
 precommit = ["pre-commit run --all-files"]
+
+[tasks.validate]
+description = "Validate the Terraform module and example code"
+run = """
+#!/usr/bin/env bash
+set -e
+echo "Validating module"
+terraform init
+terraform validate
+
+echo "Validating example code"
+cd examples/braintrust-data-plane
+# Override the module source to point to the local module.
+# '*_override.tf' is a lesser known native terraform feature
+trap 'rm -f main_override.tf' EXIT
+cat <<EOF> main_override.tf
+module "braintrust-data-plane" {
+  source = "../../"
+}
+EOF
+terraform init
+terraform validate
+"""

modules/brainstore/main-writer.tf

Lines changed: 19 additions & 18 deletions
@@ -36,24 +36,25 @@ resource "aws_launch_template" "brainstore_writer" {
   }

   user_data = base64encode(templatefile("${path.module}/templates/user_data.sh.tpl", {
-    aws_region                                = data.aws_region.current.name
-    deployment_name                           = var.deployment_name
-    database_secret_arn                       = var.database_secret_arn
-    database_host                             = var.database_host
-    database_port                             = var.database_port
-    redis_host                                = var.redis_host
-    redis_port                                = var.redis_port
-    brainstore_port                           = var.port
-    brainstore_s3_bucket                      = aws_s3_bucket.brainstore.id
-    brainstore_license_key                    = var.license_key
-    brainstore_version_override               = var.version_override == null ? "" : var.version_override
-    brainstore_release_version                = local.brainstore_release_version
-    brainstore_disable_optimization_worker    = var.brainstore_disable_optimization_worker
-    brainstore_vacuum_all_objects             = var.brainstore_vacuum_all_objects
-    brainstore_enable_index_validation        = var.brainstore_enable_index_validation
-    brainstore_index_validation_only_deletes  = var.brainstore_index_validation_only_deletes
-    is_dedicated_writer_node                  = "true"
-    extra_env_vars                            = var.extra_env_vars
+    aws_region                             = data.aws_region.current.name
+    deployment_name                        = var.deployment_name
+    database_secret_arn                    = var.database_secret_arn
+    database_host                          = var.database_host
+    database_port                          = var.database_port
+    redis_host                             = var.redis_host
+    redis_port                             = var.redis_port
+    brainstore_port                        = var.port
+    brainstore_s3_bucket                   = aws_s3_bucket.brainstore.id
+    brainstore_license_key                 = var.license_key
+    brainstore_version_override            = var.version_override == null ? "" : var.version_override
+    brainstore_release_version             = local.brainstore_release_version
+    brainstore_disable_optimization_worker = var.brainstore_disable_optimization_worker
+    brainstore_vacuum_all_objects          = var.brainstore_vacuum_all_objects
+    is_dedicated_writer_node               = "true"
+    extra_env_vars                         = var.extra_env_vars_writer
+    internal_observability_api_key         = var.internal_observability_api_key
+    internal_observability_env_name        = var.internal_observability_env_name
+    internal_observability_region          = var.internal_observability_region
   }))

   tags = merge({

modules/brainstore/main.tf

Lines changed: 7 additions & 6 deletions
@@ -56,12 +56,13 @@ resource "aws_launch_template" "brainstore" {
     brainstore_version_override = var.version_override == null ? "" : var.version_override
     brainstore_release_version  = local.brainstore_release_version
     # Important note: if there are no dedicated writer nodes, this node serves as a read/writer node
-    brainstore_disable_optimization_worker    = local.has_writer_nodes ? true : var.brainstore_disable_optimization_worker
-    brainstore_vacuum_all_objects             = local.has_writer_nodes ? false : var.brainstore_vacuum_all_objects
-    brainstore_enable_index_validation        = var.brainstore_enable_index_validation
-    brainstore_index_validation_only_deletes  = var.brainstore_index_validation_only_deletes
-    is_dedicated_writer_node                  = "false"
-    extra_env_vars                            = var.extra_env_vars
+    brainstore_disable_optimization_worker = local.has_writer_nodes ? true : var.brainstore_disable_optimization_worker
+    brainstore_vacuum_all_objects          = local.has_writer_nodes ? false : var.brainstore_vacuum_all_objects
+    is_dedicated_writer_node               = "false"
+    extra_env_vars                         = var.extra_env_vars
+    internal_observability_api_key         = var.internal_observability_api_key
+    internal_observability_env_name        = var.internal_observability_env_name
+    internal_observability_region          = var.internal_observability_region
   }))

   tags = merge({
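The has_writer_nodes local used in the conditionals above is not shown in this diff. A minimal sketch of how such a flag is typically derived (an assumption, not the module's actual definition):

# Assumed definition: treat the deployment as having dedicated writers when at
# least one writer instance is requested. Reader nodes then skip the optimization
# worker and vacuum duties, which the conditionals above hand to the writer.
locals {
  has_writer_nodes = var.writer_instance_count > 0
}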

modules/brainstore/templates/user_data.sh.tpl

Lines changed: 36 additions & 2 deletions
@@ -104,8 +104,6 @@ BRAINSTORE_CACHE_DIR=/mnt/tmp/brainstore
 BRAINSTORE_LICENSE_KEY=${brainstore_license_key}
 BRAINSTORE_DISABLE_OPTIMIZATION_WORKER=${brainstore_disable_optimization_worker}
 BRAINSTORE_VACUUM_OBJECT_ALL=${brainstore_vacuum_all_objects}
-BRAINSTORE_INDEX_WRITER_VALIDATE=${brainstore_enable_index_validation}
-BRAINSTORE_INDEX_WRITER_VALIDATE_ONLY_DELETES=${brainstore_index_validation_only_deletes}
 NO_COLOR=1
 AWS_DEFAULT_REGION=${aws_region}
 AWS_REGION=${aws_region}
@@ -119,6 +117,42 @@ if [ "${is_dedicated_writer_node}" = "true" ]; then
   echo '0 * * * * root /usr/bin/docker restart brainstore > /var/log/brainstore-restart.log 2>&1' > /etc/cron.d/restart-brainstore
 fi

+if [ -n "${internal_observability_api_key}" ]; then
+  if [ -n "${internal_observability_env_name}" ]; then
+    export DD_ENV="${internal_observability_env_name}"
+  fi
+  # Install Datadog Agent
+  export DD_API_KEY="${internal_observability_api_key}"
+  export DD_SITE="${internal_observability_region}.datadoghq.com"
+  export DD_APM_INSTRUMENTATION_ENABLED=host
+  export DD_APM_INSTRUMENTATION_LIBRARIES=java:1,python:3,js:5,php:1,dotnet:3
+  bash -c "$(curl -L https://install.datadoghq.com/scripts/install_script_agent7.sh)"
+  usermod -a -G docker dd-agent
+
+  cat <<EOF > /etc/datadog-agent/environment
+DD_OTLP_CONFIG_RECEIVER_PROTOCOLS_HTTP_ENDPOINT=0.0.0.0:4318
+DD_COLLECT_EC2_TAGS=true
+DD_COLLECT_EC2_TAGS_USE_IMDS=true
+EOF
+  # Configure Datadog Agent to collect Docker logs
+  cat <<EOF >> /etc/datadog-agent/datadog.yaml
+logs_enabled: true
+listeners:
+  - name: docker
+config_providers:
+  - name: docker
+    polling: true
+logs_config:
+  container_collect_all: true
+EOF
+  # Configure Brainstore to send traces to Datadog
+  cat <<EOF >> /etc/brainstore.env
+BRAINSTORE_OTLP_HTTP_ENDPOINT=http://localhost:4318
+EOF
+  # Restart Datadog Agent to pick up new configuration
+  systemctl restart datadog-agent
+fi
+
 BRAINSTORE_RELEASE_VERSION=${brainstore_release_version}
 BRAINSTORE_VERSION_OVERRIDE=${brainstore_version_override}
 BRAINSTORE_VERSION=$${BRAINSTORE_VERSION_OVERRIDE:-$${BRAINSTORE_RELEASE_VERSION}}