Elasticsearch & OpenSearch Reference

Query DSL, index management, aggregations, mappings, and the production patterns that actually matter — covering both Elasticsearch 8.x and OpenSearch 2.x.

Index operations and cluster health

# Cluster health
GET _cluster/health
GET _cluster/health?wait_for_status=yellow&timeout=30s

# Node info
GET _cat/nodes?v
GET _nodes/stats/jvm,os,process

# Index operations
GET _cat/indices?v&s=index
GET _cat/indices?v&h=index,docs.count,store.size,health
GET _cat/indices?v&index=my-logs-*    # wildcard

# Create index with settings
PUT my-index
{
  "settings": {
    "number_of_shards":   3,
    "number_of_replicas": 1,
    "refresh_interval":   "5s",       # default 1s — increase for heavy indexing
    "index.max_result_window": 10000  # default — never raise this, use search_after
  }
}

# Delete index
DELETE my-index
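DELETE my-logs-2025-*     # wildcard — rejected on Elasticsearch 8.x unless action.destructive_requires_name is set to false (it defaults to true since 8.0)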

# Index aliases
POST _aliases
{
  "actions": [
    { "add": { "index": "my-index-v2", "alias": "my-index" } },
    { "remove": { "index": "my-index-v1", "alias": "my-index" } }
  ]
}

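# Zero-downtime reindex: copy into the new index, then atomically swap the alias with POST _aliases (add + remove in one request)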
POST _reindex
{
  "source": { "index": "old-index" },
  "dest":   { "index": "new-index" }
}

# Shard allocation
GET _cat/shards?v&h=index,shard,prirep,state,node
GET _cluster/allocation/explain    # why is a shard unassigned?

Mappings — field types

# Mappings define how fields are indexed
PUT my-index
{
  "mappings": {
    "properties": {
      "id":          { "type": "keyword" },          # exact match only, not analyzed
      "title":       { "type": "text",               # full-text searched, analyzed
                       "analyzer": "english",         # stemming, stop words
                       "fields": {
                         "raw": { "type": "keyword" } # also available as keyword
                       }},
      "price":       { "type": "float" },
      "quantity":    { "type": "integer" },
      "active":      { "type": "boolean" },
      "created_at":  { "type": "date",
                       "format": "strict_date_optional_time||epoch_millis" },
      "tags":        { "type": "keyword" },          # array of keywords — same type
      "description": { "type": "text", "index": false }, # kept in _source but not indexed — cannot be searched
      "location":    { "type": "geo_point" },        # lat/lon for geo queries
      "metadata": {
        "type": "object",
        "properties": {
          "source": { "type": "keyword" },
          "score":  { "type": "float" }
        }
      }
    }
  }
}

# text vs keyword — the key decision
# keyword: filter, sort, aggregations, exact match
# text: full-text search (analyzed — stemmed, tokenized)
# Use both via multi-fields when you need both behaviors

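# Dynamic mapping gotcha:
# The first document that contains a field sets that field's type
# Numeric strings are coerced by default ({ "price": "10.99" } is accepted on a float field),
# but { "price": "free" } or { "price": { "amount": 10 } } → document rejected with a parsing exception
# Use strict dynamic mapping to prevent surprises: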
PUT my-index/_mapping
{
  "dynamic": "strict"   # reject unknown fields
  // "dynamic": false   // ignore unknown fields
  // "dynamic": true    // (default) auto-map unknown fields
}

# View current mapping
GET my-index/_mapping
GET my-index/_mapping/field/title

Query DSL — search queries

# Basic search
GET my-index/_search
{
  "query": {
    "match": { "title": "elasticsearch tutorial" }    # full-text, analyzed
  }
}

# Exact match (keyword field)
GET my-index/_search
{
  "query": {
    "term": { "status": "published" }      # keyword field
  }
}

# Multiple terms
{
  "query": {
    "terms": { "status": ["published", "featured"] }
  }
}

# Range query
{
  "query": {
    "range": {
      "created_at": {
        "gte": "2026-01-01",
        "lte": "now",
        "format": "strict_date_optional_time"
      }
    }
  }
}

# Boolean query — the workhorse
{
  "query": {
    "bool": {
      "must": [                            # AND — affects score
        { "match": { "title": "elasticsearch" } },
        { "match": { "body": "tutorial" } }
      ],
      "filter": [                          # AND — does NOT affect score (faster!)
        { "term": { "status": "published" } },
        { "range": { "created_at": { "gte": "now-30d" } } }
      ],
      "must_not": [                        # NOT
        { "term": { "draft": true } }
      ],
      "should": [                          # OR — boosts score if matches
        { "term": { "featured": true } }
      ],
      "minimum_should_match": 1            # makes at least 1 should clause mandatory; omit it (default 0 when must/filter is present) to keep should as a pure score boost
    }
  }
}

# Multi-match — search across multiple fields
{
  "query": {
    "multi_match": {
      "query": "elasticsearch guide",
      "fields": ["title^3", "body", "tags"],   # ^3 = boost title 3x
      "type": "best_fields"                     # default; alternatives: cross_fields, most_fields, phrase
    }
  }
}

# Prefix and wildcard (avoid on large datasets!)
{ "query": { "prefix": { "username": "ale" } } }         # starts with
{ "query": { "wildcard": { "email": "*@gmail.com" } } }   # expensive!

# Exists — field is present and not null
{ "query": { "exists": { "field": "email" } } }

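# Pagination (prefer search_after over from/size for deep pagination)
{
  "size": 20,
  "from": 0,                              # must be 0 (or omitted) with search_after; without it, from + size > 10000 is rejected
  "sort": [{ "created_at": "desc" }],     # add a unique tiebreaker field to the sort so hits sharing a timestamp are not skipped or repeated
  "search_after": ["2026-03-01T00:00:00"] # cursor-based (use last hit's sort values)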
}

Aggregations

# Aggregations are computed over every document matching the query — "from"/"size" pagination does not affect them
# "size": 0 skips returning hits — faster for aggregation-only queries

GET orders/_search
{
  "size": 0,
  "query": {
    "range": { "created_at": { "gte": "now-30d" } }  # filter first!
  },
  "aggs": {
    "by_status": {
      "terms": {                          # bucket by unique values
        "field": "status",
        "size": 10,                       # top 10 by count
        "order": { "_count": "desc" }
      }
    },
    "revenue": {
      "sum": { "field": "amount" }        # metric aggregation
    },
    "avg_order": {
      "avg": { "field": "amount" }
    },
    "orders_over_time": {
      "date_histogram": {                 # time-series buckets
        "field": "created_at",
        "calendar_interval": "day",
        "time_zone": "Europe/London"
      },
      "aggs": {                           # nested aggregation
        "daily_revenue": { "sum": { "field": "amount" } }
      }
    },
    "price_ranges": {
      "range": {
        "field": "amount",
        "ranges": [
          { "to": 25 },
          { "from": 25, "to": 100 },
          { "from": 100 }
        ]
      }
    },
    "percentiles": {
      "percentiles": {
        "field": "amount",
        "percents": [50, 75, 95, 99]
      }
    }
  }
}

# Cardinality — approximate distinct count
{
  "aggs": {
    "unique_users": {
      "cardinality": {
        "field": "user_id",
        "precision_threshold": 1000    # higher = more accurate, more memory
      }
    }
  }
}

Index lifecycle management (ILM)

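# ILM — automatically manage index lifecycle for time-series data (logs, metrics)
# Elasticsearch only: OpenSearch has no _ilm API — use Index State Management (ISM) there: PUT _plugins/_ism/policies/<policy_id>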

PUT _ilm/policy/my-logs-policy
{
  "policy": {
    "phases": {
      "hot": {
        "min_age": "0ms",
        "actions": {
          "rollover": {
            "max_primary_shard_size": "50gb",
            "max_age": "1d"
          },
          "set_priority": { "priority": 100 }
        }
      },
      "warm": {
        "min_age": "3d",
        "actions": {
          "forcemerge": { "max_num_segments": 1 },
          "shrink": { "number_of_shards": 1 },
          "allocate": { "number_of_replicas": 1 }
        }
      },
      "cold": {
        "min_age": "30d",
        "actions": {
          "allocate": { "number_of_replicas": 0 }
        }
      },
      "delete": {
        "min_age": "90d",
        "actions": { "delete": {} }
      }
    }
  }
}

# Index template — applied to new indices matching a pattern
PUT _index_template/my-logs-template
{
  "index_patterns": ["my-logs-*"],
  "template": {
    "settings": {
      "index.lifecycle.name":   "my-logs-policy",
      "index.lifecycle.rollover_alias": "my-logs"
    }
  },
  "priority": 200
}

# Bootstrap first index and alias
PUT my-logs-000001
{
  "aliases": {
    "my-logs": { "is_write_index": true }
  }
}

# Now write to alias "my-logs" — ILM handles rollover automatically

Performance and diagnostics

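# Slow log — queries taking > threshold
PUT my-index/_settings
{
  "index.search.slowlog.threshold.query.warn":  "10s",
  "index.search.slowlog.threshold.query.info":  "5s",
  "index.search.slowlog.threshold.query.debug": "2s",
  "index.search.slowlog.level": "info"    # OpenSearch / Elasticsearch 7.x only — removed in Elasticsearch 8.0; on 8.x drop this line (and the comma above) and rely on the thresholds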
}

# Hot threads — what is Elasticsearch actually doing?
GET _nodes/hot_threads

# Index stats — which index is slow?
GET _stats/search,indexing,store
GET my-index/_stats

# Fielddata / query cache
GET _nodes/stats/indices/fielddata?fields=*    # fielddata memory by field
GET _nodes/stats/indices/query_cache

# Profile API — diagnose slow queries
GET my-index/_search
{
  "profile": true,
  "query": { "match": { "title": "elasticsearch" } }
}
# Returns timing breakdown per shard/query/collector

# Common performance patterns
# 1. Put non-scoring conditions in bool "filter" rather than "must" (filter context = no scoring, results can be cached)
# 2. Use keyword for sort/agg fields (text field sort requires fielddata = memory hog)
# 3. search_after for deep pagination (with from/size every shard must collect from + size hits)
# 4. _source filtering — only fetch fields you need
GET my-index/_search
{
  "_source": ["title", "created_at"],    # don't fetch large body field
  "query": { "match": { "title": "search" } }
}
# 5. Keep doc_values enabled on keyword/numeric fields you sort or aggregate on — sorting and aggregations read them
# 6. Refresh interval: "30s" for heavy indexing (default 1s causes many small segments)

Useful one-liners

# Check if document exists
HEAD my-index/_doc/doc-id   # 200 = exists, 404 = not found

# Get a document
GET my-index/_doc/doc-id

# Bulk indexing (newline-delimited — MUST end with \n)
POST _bulk
{ "index": { "_index": "my-index", "_id": "1" } }
{ "title": "Document one", "created_at": "2026-03-14" }
{ "index": { "_index": "my-index", "_id": "2" } }
{ "title": "Document two", "created_at": "2026-03-14" }

# Update by query
POST my-index/_update_by_query
{
  "query": { "term": { "status": "draft" } },
  "script": {
    "source": "ctx._source.status = 'archived'",
    "lang": "painless"
  }
}

# Delete by query
POST my-index/_delete_by_query
{
  "query": { "range": { "created_at": { "lt": "now-365d" } } }
}

# Reindex with script transform
POST _reindex
{
  "source": { "index": "old-index", "size": 1000 },
  "dest":   { "index": "new-index" },
  "script": {
    "source": "ctx._source.full_name = ctx._source.first + ' ' + ctx._source.last",
    "lang": "painless"
  }
}

# Cat APIs for quick ops
GET _cat/health?v
GET _cat/indices?v&s=docs.count:desc    # sorted by doc count
GET _cat/aliases?v
GET _cat/tasks?v                        # running tasks
GET _cat/thread_pool?v&h=name,active,queue,rejected

🔍 Free tool: PyPI Package Health Checker — check elasticsearch-py, opensearch-py, and related Python packages for known CVEs and active maintenance.