Elasticsearch - Organize search by priority with/without spaces

I am using Elasticsearch 7.15.0. I want to organize search by priority, with and without spaces. What do I mean by that?

query - "surf coff"

  1. First, I want to see records that contain or start with "surf coff" - (SURF COFFEE, SURF CAFFETERIA, SURFCOFFEE MAN)
  2. Then records that contain or start with "surf" - (SURF, SURF LOVE, ENDLESS SURF)
  3. Then records that contain or start with "coff" - (LOVE COFFEE, COFFEE MAN)

query - "surfcoff"

  1. I want to see only records that contain or start with "surfcoff" - (SURF COFFEE, SURF CAFFETERIA, SURFCOFFEE MAN).
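
For context, the documents could be indexed with something like this (a rough sketch; the index name my_index is a placeholder, and text is the field the queries below search against):

POST /my_index/_bulk
{ "index": {} }
{ "text": "SURF COFFEE" }
{ "index": {} }
{ "text": "SURF CAFFETERIA" }
{ "index": {} }
{ "text": "SURFCOFFEE MAN" }
{ "index": {} }
{ "text": "SURF LOVE" }
{ "index": {} }
{ "text": "LOVE COFFEE" }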

I created an analyzer with the following filters:

  • lowercase
  • word_delimiter_graph
  • shingle
  • edge_ngram
  • pattern_replace (to remove spaces)

{
   "settings":{
       "index": {
            "max_shingle_diff" : 9,
            "max_ngram_diff": 9
       },
      "analysis":{
         "analyzer":{
            "word_join_analyzer":{
               "tokenizer":"standard",
               "filter":[
                  "lowercase",
                  "word_delimiter_graph",
                  "my_shingle",
                  "my_edge_ngram",
                  "my_char_filter"
               ]
            }
         },
         "filter":{
            "my_shingle":{
               "type":"shingle",
               "min_shingle_size": 2,
                "max_shingle_size": 10
            },
            "my_edge_ngram": { 
                "type": "edge_ngram",
                "min_gram": 2,
                "max_gram": 10,
                "token_chars": ["letter", "digit"]
            },
            "my_char_filter": {
                "type": "pattern_replace",
                "pattern": " ",
                "replacement": ""
            }
         }
      }
   }
}
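
The analyzer is attached to the text field with a mapping along these lines (a sketch, assuming the index, here called my_index, was already created with the settings above):

PUT /my_index/_mapping
{
  "properties": {
    "text": {
      "type": "text",
      "analyzer": "word_join_analyzer"
    }
  }
}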

So when I analyzed the text "SURF COFFEE", I got this result:

{
    "tokens": [
        {
            "token": "su",
            "start_offset": 0,
            "end_offset": 4,
            "type": "<ALPHANUM>",
            "position": 0
        },
        {
            "token": "sur",
            "start_offset": 0,
            "end_offset": 4,
            "type": "<ALPHANUM>",
            "position": 0
        },
        {
            "token": "surf",
            "start_offset": 0,
            "end_offset": 4,
            "type": "<ALPHANUM>",
            "position": 0
        },
        {
            "token": "su",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "sur",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "surf",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "surf",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "surfc",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "surfco",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "surfcof",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "surfcoff",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "surfcoffe",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "co",
            "start_offset": 5,
            "end_offset": 11,
            "type": "<ALPHANUM>",
            "position": 1
        },
        {
            "token": "cof",
            "start_offset": 5,
            "end_offset": 11,
            "type": "<ALPHANUM>",
            "position": 1
        },
        {
            "token": "coff",
            "start_offset": 5,
            "end_offset": 11,
            "type": "<ALPHANUM>",
            "position": 1
        },
        {
            "token": "coffe",
            "start_offset": 5,
            "end_offset": 11,
            "type": "<ALPHANUM>",
            "position": 1
        },
        {
            "token": "coffee",
            "start_offset": 5,
            "end_offset": 11,
            "type": "<ALPHANUM>",
            "position": 1
        }
    ]
}

As you can see, there is a token "surfcoff".
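
(This token list comes from the _analyze API; a request along these lines reproduces it, assuming the index is called my_index:)

GET /my_index/_analyze
{
  "analyzer": "word_join_analyzer",
  "text": "SURF COFFEE"
}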

How should my search be organized?

I've tried to combine approaches with a bool should query using query_string, match_phrase_prefix, match_bool_prefix and others.

But none of them gave correct results.

Can you please help me?

How should my query be built? Or should I try other analyzer filters?

For example, this query:

{
  "query": {
    "bool": {
      "should": [
        {
          "query_string": {
                "query": "surf coff",
                "default_field": "text",
                "default_operator": "AND"
            }
        },
        {
          "query_string": {
                "query": "surf",
                "default_field": "text"
            }
        },
        {
          "query_string": {
                "query": "coff",
                "default_field": "text"
            }
        }
      ]
    }
  }
}

or this query

{
  "query": {
    "bool": {
      "should": [
        {
          "query_string": {
                "query": "(surf coff) OR (surf) OR (coff)",
                "default_field": "text"
            }
        }
      ]
    }
  }
}

or this query

{
  "query": {
    "bool": {
      "should": [
        {
          "query_string": {
                "query": "((surf AND coff)^3 OR (surf)^2 OR (coff)^1)",
                "default_field": "text"
            }
        }
      ]
    }
  }
}

or

{
  "query": {
    "match_bool_prefix" : {
      "text" : "surf coff"
    }
  }
}

gives

  1. SURF COFFEE SURFING NEVER ALONE
  2. CONOSUR COLCHAGUA CONO SUR
  3. SUNRISE CONCHA TORO SUNRISE 300 DAYS
  4. SUN COFFEE
  5. SURF COFFEE PROPAGANDA ....

But this looks strange to me; I think I'm misunderstanding something.
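
To see why an individual document matches and how it is scored, the _explain API can help (a sketch, assuming the index is called my_index and the document id is 1):

GET /my_index/_explain/1
{
  "query": {
    "match_bool_prefix": {
      "text": "surf coff"
    }
  }
}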


Solution 1:

{
  "query": {
    "bool": {
      "should": [
        {
          "query_string": {
                "query": "(surf* AND coff*)^3 OR (surf*)^2 OR (coff*)^1",
                "default_field": "text"
            }
        }
      ]
    }
  }
}

and these are the updated index settings, with the edge_ngram filter removed:

{
   "settings":{
       "index": {
            "max_shingle_diff" : 9,
            "max_ngram_diff": 9
       },
      "analysis":{
         "analyzer":{
            "word_join_analyzer":{
               "tokenizer":"standard",
               "filter":[
                  "lowercase",
                  "word_delimiter_graph",
                   "my_shingle",
                   "my_char_filter"
               ]
            }
         },
         "filter":{
            "my_shingle":{
               "type":"shingle",
               "min_shingle_size": 2,
                "max_shingle_size": 10
            },
            "my_char_filter": {
                "type": "pattern_replace",
                "pattern": " ",
                "replacement": ""
            }
         }
      }
   }
}

Removing the edge_ngram filter and adding a wildcard query with boosts resolved my question. But I still don't understand why edge_ngram didn't work.
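
For completeness, the same prioritization can also be expressed with explicit wildcard queries instead of query_string syntax (a sketch that mirrors the boosts of the query above):

{
  "query": {
    "bool": {
      "should": [
        {
          "bool": {
            "must": [
              { "wildcard": { "text": { "value": "surf*" } } },
              { "wildcard": { "text": { "value": "coff*" } } }
            ],
            "boost": 3
          }
        },
        { "wildcard": { "text": { "value": "surf*", "boost": 2 } } },
        { "wildcard": { "text": { "value": "coff*", "boost": 1 } } }
      ]
    }
  }
}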