# DDCS: Helm Values

# Base name for this deployment.
name: ddcs

global:
  # Additional labels; none by default.
  # Written as an explicit empty map: a bare `additionalLabels:` is parsed as null.
  # NOTE(review): presumably applied to every resource the chart renders — confirm
  # against the templates.
  additionalLabels: {}

# Container image settings.
image:
  # Where the image is pulled from.
  registry: nvcr.io
  repository: nvidia/omniverse/ddcs-dist-kv
  # Image tag.
  tag: "latest"
  # NOTE(review): the effect of overrideTag is not visible in this file —
  # confirm against the templates before relying on it.
  overrideTag: false
  pullPolicy: IfNotPresent
  # Pull secrets, referenced by name.
  pullSecrets:
    - name: regcred

# NOTE(review): `image.pullSecrets` also lists pull secrets; confirm which of the
# two keys the templates actually read.
imagePullSecrets: []
# Name overrides; empty by default.
nameOverride: ""
fullnameOverride: ""

# Whole cluster configuration
cluster:
  # The number of pods to deploy
  #
  # By default the pods have a scheduler preference to be placed on different nodes. If an OSSKV pod
  # is already deployed to a node, another will still be scheduled there if it is possible to do so.
  # This allows the load to spread between nodes without making it impossible to scale up on a single node.
  #
  # When using networked storage classes like 'managed-csi-premium' on AKS or NVMesh classes, it may
  # benefit the deployment to have several replicas scheduled on the same node.
  # For example, in AKS *EACH* volume can likely burst to ~150MB/s.
  replicas: 1

  # Node selector for the pods; empty by default.
  nodeSelector: {}

  # Tolerations for the pods; empty by default.
  tolerations: []

  # When true, the scheduler will prefer to place KV pods on nodes that do not already
  # have a KV store.
  selfAntiAffinity: true

  # Additional affinity rules; empty by default.
  # NOTE(review): how this combines with `selfAntiAffinity` is not visible here — confirm.
  affinity: {}

  # Annotations for the pods; empty by default.
  podAnnotations: {}

  # Pod-level security context; empty by default. The commented line below is an example.
  podSecurityContext: {}
    # fsGroup: 2000

  # Controls an OpenTelemetry collector/operator deployment that can be used to
  # deploy sidecars in each pod. The sidecar collects the metrics from the running service
  # and sends them to another OTEL collector.
  #
  # If enabled, this option will disable the ServiceMonitor.
  otelCollector:
    # When enabled `otelHost` and `otelPort` should be `localhost` and `4317`.
    enabled: false
    # The sidecar itself produces metrics; when true, those metrics are collected too.
    includeCollectorMetrics: true
    # No batch settings by default. Written as an explicit empty map: a bare
    # `batch:` is parsed as null.
    # NOTE(review): presumably options for the collector's batching stage — confirm.
    batch: {}
    export:
      otlp:
        # Set this to the correct location of the collector in your cluster.
        endpoint: otel-collector.svc.svc.cluster.local:4317
        tls:
          insecure: true

  # Both keys are empty by default. They are written as explicit empty maps: a
  # bare `labels:` / `spec:` is parsed as null.
  # NOTE(review): presumably extra labels and spec fields for the monitoring
  # resource — confirm against the templates.
  monitoring:
    labels: {}
    spec: {}

  # Configuration applied to each node.
  container:

    # NOTE(review): presumably exported to the process as RUST_LOG / RUST_BACKTRACE — confirm.
    rustLog: "info"
    rustBacktrace: "full"

    # Container-level security context; empty by default. The commented lines are an example.
    securityContext: {}
      # capabilities:
      #   drop:
      #   - ALL
      # readOnlyRootFilesystem: true
      # runAsNonRoot: true
      # runAsUser: 1000

    # No resource requests or limits by default. Written as an explicit empty map:
    # a bare `resources:` is parsed as null. The commented lines are an example.
    resources: {}
      # limits:
      #   memory: 64G

    storage:
      volume:
        # When enabled the DB environment will reside on a k8s mounted volume.
        enabled: true
        # This number should be greater than what is provided in settings.storageLimit.
        size: 300Gi
        # This is azure specific and should be changed to match a class available in your cluster.
        storageClassName: managed-csi

    settings:
      # Format to use when logging.
      # Allowed values are: human, human_no_color, json
      logFormat: "json"
      # The maximum amount of storage space the DB will target for use on the persistent volume.
      #
      # This includes all space necessary to maintain a WAL, SST files etc.
      # Keep this below storage.volume.size (300Gi by default).
      storageLimit: 275G
      # Configuration for garbage collection.
      garbageCollection:
        # The minimum free capacity target for the target map, as a percentage.
        #
        # If the percentage of free map space falls below the given amount, GC
        # will begin.
        minFreeCapacity: 40
        # Once GC begins, the system will attempt to remove the given quantile of the entire
        # keyspace.
        #
        # If this number is 60, then 60% of keys will be deleted.
        deleteKeyspaceQuantile: 60
        # The interval on which to check the database capacity
        # (presumably milliseconds, per the `Ms` suffix).
        checkDbCapacityMs: 1000
      # Settings for telemetry services.
      telemetry:
        # Enables or disables prometheus metric export.
        #
        # An http service is started on the specified port.
        prometheusMetricsExposition: true
        # The port to serve metrics on.
        prometheusMetricsPort: 3051
        # Enables or disables OTEL exposition.
        otelExposition: false
        # The OTEL service collection port.
        otelPort: 4317
        # The OTEL service host address.
        #
        # If otelCollector is enabled, this option should remain as `localhost`.
        otelHost: localhost
        # When true the IP env var must be specified.
        #
        # Useful for situations in which the host or the pod runs an OTEL collector.
        otelPodIpAsHost: false
      # Engine specific configuration.
      engine:
        # Think of the row cache like a hashmap. When a key-pair is read it is placed in the map. The next lookup
        # of that key will be served by the row cache. The engine does not do a row scan for the data and no disk
        # io is necessary.
        # This is primarily where HOT data is served from.
        sys.cache_size: "32G"
        # Key/value pairs can also be served from the block cache. When the engine searches rows on disk it can keep
        # some amount of them cached in memory. This allows the next lookup to be fast if the data resides in the file
        # blocks that have been cached.
        # More than just key/value pairs goes in the block cache; in general a large block cache can help reduce scan time.
        sys.block_cache_size: "8G"
        sys.block_cache_num_shard_bits: 8
        sys.increase_parallelism: 8
        sys.use_write_buffer_manager: 1

        # Should be equal to the number of threads.
        # RocksDB uses background jobs to flush data to disk.
        # These two values should remain equal.
        # NOTE(review): "these two values" presumably means sys.increase_parallelism and
        # db.max_background_jobs (both 8 here) — confirm.
        db.max_background_jobs: 8

        db.table_cache_numshardbits: 6

        db.allow_concurrent_memtable_write: true

        # This number limits the files that RocksDB will keep open. This is required because K8s
        # will consider the page files associated with open files as memory used by the pod, i.e.
        # the pod can be OOM killed for having too many open files.
        db.max_open_files: 128

        # Write buffers have a default size of 64mb
        # 128 * 64 MB = ~8GB of write capacity before writes are slowed to disk speed
        # When 1 buffer is full the engine switches to the next. When all buffers are full, writes are stalled until another
        # becomes available.
        cf.min_write_buffer_number_to_merge: 2
        cf.max_write_buffer_number: 128

        # Enables the usage of blob db
        cf.enable_blob_files: true
        cf.enable_blob_garbage_collection: true

        # Values of the given size are placed in new blob files instead of going through compaction.
        cf.min_blob_size: 1MB

        # Ensure that stale files are removed more often so that GC is not triggered when it does not need to be.
        # This is 3 minutes.
        # The setting below is currently commented out.
        #cf.delete_obsolete_files_period_micros: 180000000

        # Ensure that SST files are flowed through the compaction filter every 4 hours.
        cf.periodic_compaction_seconds: 14400
      # Configure the gRPC HTTP service.
      grpc:
        # Set the initial HTTP2 stream window size.
        # Must not be 0.
        initialStreamWindowSize: 512K
        # Set the initial HTTP2 connection window size.
        # Must not be 0.
        initialConnectionWindowSize: 32M
        # Max number of requests that can be operated on concurrently per connection.
        # Must not be 0.
        connectionConcurrencyLimit: 32
        # Max number of HTTP2 streams per connection.
        # 0 for unlimited.
        maxConcurrentStreams: 0
        # Max frame size for each HTTP2 data frame.
        # 0 for HTTP2 default.
        maxFrameSize: 0
        # Max time that a request can take, in seconds.
        # Must not be 0.
        timeoutSeconds: 30
        # The amount of time, in seconds, to wait before timing out a protobuf write.
        writeTimeoutSeconds: 30
        # The amount of idle time, in seconds, before TCP keepalive probes are sent.
        tcpKeepaliveAfterIdleSeconds: 5
        # The amount of time, in seconds, between each TCP keepalive probe.
        tcpKeepaliveIntervalSeconds: 3
        # The number of probes to send before the socket is considered reset.
        tcpKeepaliveRetries: 15
        # The amount of time, in seconds, to wait between http2 keepalive probes.
        http2KeepaliveIntervalSeconds: 5
        # The amount of time, in seconds, to wait before the connection is considered reset.
        http2KeepaliveTimeoutSeconds: 45
        # The max message size that the grpc service will accept for decoding.
        maxDecodingMessageSize: 5M
        # If true a tenant ID can be set through a string set in the metadata.
        tenantFromMetadata: false
        # TLS configuration options.
        tls:
          # If true, requires TLS/https encrypted transport
          enabled: false
          # Use existing kubernetes.io/tls secret
          secretName: ddcs-tls
          # The path to the certificate to use. Must be in the same directory as the key.
          cert: "/tls/tls.crt"
          # The key for the certificate. Must be in the same directory as the cert.
          key: "/tls/tls.key"
          # If true, the given root ca is used.
          includeCaRoot: false
          # The path to the ca root.
          caRoot: "/cert/path/ca.pem"
        # JWT verification settings.
        jwt:
          # When enabled, any jwt provided in the authorization header will be validated
          enabled: false
          # When true, a valid JWT must be provided.
          require: false
          # When true verifies the expiration time of provided JWT.
          verifyExp: false
          # When true the aud claim of the token is verified.
          verifyAud: false
          # Tokens will be required to have this value as their audience claim.
          audClaim: ""
          # The URLs to get the public JWK set from.
          # NOTE(review): "test" is not a URL and looks like a leftover placeholder —
          # replace both entries with real endpoints before enabling JWT.
          jwkPublicKeysetUrl:
            - "https://example.com/jwk.json"
            - "test"
          # The interval on which to get the JWK set.
          jwkUpdateIntervalSecs: 1500
          # The size of the cross-connection cache for JWTs.
          cacheSizeMb: 128
      # Settings that control how DDCS stores items in the storage engine.
      store:
        # The size after which values are treated as blobs.
        blobCutoff: 4M
        # The size of chunks to create when storing blobs.
        blobChunkSize: 4M
        # NOTE(review): the allowed values for `mode` are not documented in this
        # file — confirm what "legacy" selects and what the alternatives are.
        mode: "legacy"

service:
  # Annotations for the service; empty by default.
  annotations: {}
  # Use loadbalancer for the service - allows external IP
  loadBalancer: false
  # NOTE(review): presumably only relevant when `loadBalancer` is true — confirm.
  loadBalancerSourceRanges: []
  # Port for the gRPC service.
  grpcPort: 3010
  # Port for metrics. Same value as
  # cluster.container.settings.telemetry.prometheusMetricsPort (3051);
  # presumably the two must be kept in sync — confirm.
  metricsPort: 3051

# Metrics scraping configuration.
# NOTE(review): presumably configures the ServiceMonitor referred to in the
# cluster.otelCollector comments — confirm against the templates.
monitoring:
  prometheusAlerts: true
  enabled: true
  # How often metrics are scraped.
  interval: 5s
  # HTTP path the metrics are scraped from.
  path: /metrics
  # A port name rather than a port number.
  port: http-metrics
  scheme: http
  # Currently equal to `interval`.
  scrapeTimeout: 5s