ElasticSearch - Organize search by priority with/without spaces

I am using Elasticsearch 7.15.0. I want to rank search results by priority, matching queries typed with or without spaces. Here is what I mean:

query - "surf coff"

First, I want to see records that contain or start with "surf coff" - (SURF COFFEE, SURF CAFFETERIA, SURFCOFFEE MAN)
Then records that contain or start with "surf" - (SURF, SURF LOVE, ENDLESS SURF)
Then records that contain or start with "coff" - (LOVE COFFEE, COFFEE MAN)

query - "surfcoff"

I want to see only the records that contain or start with "surfcoff" - (SURF COFFEE, SURF CAFFETERIA, SURFCOFFEE MAN).

I created the analyzer with filters:

lowercase
word_delimiter_graph
shingle
edge_ngram
pattern_replace (to remove spaces)

{
   "settings":{
       "index": {
            "max_shingle_diff" : 9,
            "max_ngram_diff": 9
       },
      "analysis":{
         "analyzer":{
            "word_join_analyzer":{
               "tokenizer":"standard",
               "filter":[
                  "lowercase",
                  "word_delimiter_graph",
                  "my_shingle",
                  "my_edge_ngram",
                  "my_char_filter"
               ]
            }
         },
         "filter":{
            "my_shingle":{
               "type":"shingle",
               "min_shingle_size": 2,
                "max_shingle_size": 10
            },
            "my_edge_ngram": { 
                "type": "edge_ngram",
                "min_gram": 2,
                "max_gram": 10,
                "token_chars": ["letter", "digit"]
            },
            "my_char_filter": {
                "type": "pattern_replace",
                "pattern": " ",
                "replacement": ""
            }
         }
      }
   }
}

When I analyzed the text "SURF COFFEE", I got this result:

{
    "tokens": [
        {
            "token": "su",
            "start_offset": 0,
            "end_offset": 4,
            "type": "<ALPHANUM>",
            "position": 0
        },
        {
            "token": "sur",
            "start_offset": 0,
            "end_offset": 4,
            "type": "<ALPHANUM>",
            "position": 0
        },
        {
            "token": "surf",
            "start_offset": 0,
            "end_offset": 4,
            "type": "<ALPHANUM>",
            "position": 0
        },
        {
            "token": "su",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "sur",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "surf",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "surf",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "surfc",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "surfco",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "surfcof",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "surfcoff",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "surfcoffe",
            "start_offset": 0,
            "end_offset": 11,
            "type": "shingle",
            "position": 0,
            "positionLength": 2
        },
        {
            "token": "co",
            "start_offset": 5,
            "end_offset": 11,
            "type": "<ALPHANUM>",
            "position": 1
        },
        {
            "token": "cof",
            "start_offset": 5,
            "end_offset": 11,
            "type": "<ALPHANUM>",
            "position": 1
        },
        {
            "token": "coff",
            "start_offset": 5,
            "end_offset": 11,
            "type": "<ALPHANUM>",
            "position": 1
        },
        {
            "token": "coffe",
            "start_offset": 5,
            "end_offset": 11,
            "type": "<ALPHANUM>",
            "position": 1
        },
        {
            "token": "coffee",
            "start_offset": 5,
            "end_offset": 11,
            "type": "<ALPHANUM>",
            "position": 1
        }
    ]
}

As you can see, the token "surfcoff" is present.

How should my search be organized?

I've tried combining several approaches in a bool "should" query: query_string, match_phrase_prefix, match_bool_prefix and others.

But none of them gave correct results.

Can you please help me?

How should my query be built? Or should I try other analyzer filters?

For example, this query:

{
  "query": {
    "bool": {
      "should": [
        {
          "query_string": {
                "query": "surf coff",
                "default_field": "text",
                "default_operator": "AND"
            }
        },
        {
          "query_string": {
                "query": "surf",
                "default_field": "text"
            }
        },
        {
          "query_string": {
                "query": "coff",
                "default_field": "text"
            }
        }
      ]
    }
  }
}

or this query

{
  "query": {
    "bool": {
      "should": [
        {
          "query_string": {
                "query": "(surf coff) OR (surf) OR (coff)",
                "default_field": "text"
            }
        }
      ]
    }
  }
}

or this query

{
  "query": {
    "bool": {
      "should": [
        {
          "query_string": {
                "query": "((surf AND coff)^3 OR (surf)^2 OR (coff)^1)",
                "default_field": "text"
            }
        }
      ]
    }
  }
}

{
  "query": {
    "match_bool_prefix" : {
      "text" : "surf coff"
    }
  }
}

gives these results:

SURF COFFEE SURFING NEVER ALONE
CONOSUR COLCHAGUA CONO SUR
SUNRISE CONCHA TORO SUNRISE 300 DAYS
SUN COFFEE
SURF COFFEE PROPAGANDA ....

But this looks strange to me; I think I am misunderstanding something.

Solution 1:

{
  "query": {
    "bool": {
      "should": [
        {
          "query_string": {
                "query": "(surf* AND coff*)^3 OR (surf*)^2 OR (coff*)^1",
                "default_field": "text"
            }
        }
      ]
    }
  }
}

{
   "settings":{
       "index": {
            "max_shingle_diff" : 9,
            "max_ngram_diff": 9
       },
      "analysis":{
         "analyzer":{
            "word_join_analyzer":{
               "tokenizer":"standard",
               "filter":[
                  "lowercase",
                  "word_delimiter_graph",
                   "my_shingle",
                   "my_char_filter"
               ]
            }
         },
         "filter":{
            "my_shingle":{
               "type":"shingle",
               "min_shingle_size": 2,
                "max_shingle_size": 10
            },
            "my_char_filter": {
                "type": "pattern_replace",
                "pattern": " ",
                "replacement": ""
            }
         }
      }
   }
}

Removing the edge_ngram filter and using a wildcard query with boosts resolved my problem. But I still don't understand why edge_ngram didn't work.

ElasticSearch - Organize search by priority with/without spaces

Solution 1:

Related

Recent Posts