レシピ - 正確な検索とステミングの混合（Mixing exact search with stemming）

正確な検索とステミングの混合

正確な検索とステミングの混合

検索アプリケーションを構築する際、ステミングはしばしば必須です。skiing に対するクエリが ski や skis を含む文書とも一致することが望ましいからです。しかし、ユーザーが skiing という語そのものを検索したい場合はどうでしょうか？これを実現する一般的な方法は、multi-field を使用して同じコンテンツを 2 通りの方法でインデックス化することです：

Python

resp = client.indices.create(
   index="index",
   settings={
   "analysis": {
   "analyzer": {
   "english_exact": {
   "tokenizer": "standard",
   "filter": [
   "lowercase"
   ]
   }
   }
   }
   },
   mappings={
   "properties": {
   "body": {
   "type": "text",
   "analyzer": "english",
   "fields": {
   "exact": {
   "type": "text",
   "analyzer": "english_exact"
   }
   }
   }
   }
   },
)
print(resp)
resp1 = client.index(
   index="index",
   id="1",
   document={
   "body": "Ski resort"
   },
)
print(resp1)
resp2 = client.index(
   index="index",
   id="2",
   document={
   "body": "A pair of skis"
   },
)
print(resp2)
resp3 = client.indices.refresh(
   index="index",
)
print(resp3)

Ruby

response = client.indices.create(
  index: 'index',
  body: {
   settings: {
   analysis: {
   analyzer: {
   english_exact: {
   tokenizer: 'standard',
   filter: [
   'lowercase'
   ]
   }
   }
   }
   },
   mappings: {
   properties: {
   body: {
   type: 'text',
   analyzer: 'english',
   fields: {
   exact: {
   type: 'text',
   analyzer: 'english_exact'
   }
   }
   }
   }
   }
  }
)
puts response
response = client.index(
  index: 'index',
  id: 1,
  body: {
   body: 'Ski resort'
  }
)
puts response
response = client.index(
  index: 'index',
  id: 2,
  body: {
   body: 'A pair of skis'
  }
)
puts response
response = client.indices.refresh(
  index: 'index'
)
puts response

Js

const response = await client.indices.create({
  index: "index",
  settings: {
   analysis: {
   analyzer: {
   english_exact: {
   tokenizer: "standard",
   filter: ["lowercase"],
   },
   },
   },
  },
  mappings: {
   properties: {
   body: {
   type: "text",
   analyzer: "english",
   fields: {
   exact: {
   type: "text",
   analyzer: "english_exact",
   },
   },
   },
   },
  },
});
console.log(response);
const response1 = await client.index({
  index: "index",
  id: 1,
  document: {
   body: "Ski resort",
  },
});
console.log(response1);
const response2 = await client.index({
  index: "index",
  id: 2,
  document: {
   body: "A pair of skis",
  },
});
console.log(response2);
const response3 = await client.indices.refresh({
  index: "index",
});
console.log(response3);

コンソール

PUT index
{
  "settings": {
   "analysis": {
   "analyzer": {
   "english_exact": {
   "tokenizer": "standard",
   "filter": [
   "lowercase"
   ]
   }
   }
   }
  },
  "mappings": {
   "properties": {
   "body": {
   "type": "text",
   "analyzer": "english",
   "fields": {
   "exact": {
   "type": "text",
   "analyzer": "english_exact"
   }
   }
   }
   }
  }
}
PUT index/_doc/1
{
  "body": "Ski resort"
}
PUT index/_doc/2
{
  "body": "A pair of skis"
}
POST index/_refresh

このような設定では、ski を body で検索すると、両方の文書が返されます：

Python

resp = client.search(
   index="index",
   query={
   "simple_query_string": {
   "fields": [
   "body"
   ],
   "query": "ski"
   }
   },
)
print(resp)

Ruby

response = client.search(
  index: 'index',
  body: {
   query: {
   simple_query_string: {
   fields: [
   'body'
   ],
   query: 'ski'
   }
   }
  }
)
puts response

Js

const response = await client.search({
  index: "index",
  query: {
   simple_query_string: {
   fields: ["body"],
   query: "ski",
   },
  },
});
console.log(response);

コンソール

GET index/_search
{
  "query": {
   "simple_query_string": {
   "fields": [ "body" ],
   "query": "ski"
   }
  }
}

コンソール-結果

{
  "took": 2,
  "timed_out": false,
  "_shards": {
   "total": 1,
   "successful": 1,
   "skipped" : 0,
   "failed": 0
  },
  "hits": {
   "total" : {
   "value": 2,
   "relation": "eq"
   },
   "max_score": 0.18232156,
   "hits": [
   {
   "_index": "index",
   "_id": "1",
   "_score": 0.18232156,
   "_source": {
   "body": "Ski resort"
   }
   },
   {
   "_index": "index",
   "_id": "2",
   "_score": 0.18232156,
   "_source": {
   "body": "A pair of skis"
   }
   }
   ]
  }
}

一方、ski を body.exact で検索すると、文書 1 のみが返されます。なぜなら、body.exact の分析チェーンはステミングを行わないからです。

Python

resp = client.search(
   index="index",
   query={
   "simple_query_string": {
   "fields": [
   "body.exact"
   ],
   "query": "ski"
   }
   },
)
print(resp)

Ruby

response = client.search(
  index: 'index',
  body: {
   query: {
   simple_query_string: {
   fields: [
   'body.exact'
   ],
   query: 'ski'
   }
   }
  }
)
puts response

Js

const response = await client.search({
  index: "index",
  query: {
   simple_query_string: {
   fields: ["body.exact"],
   query: "ski",
   },
  },
});
console.log(response);

コンソール

GET index/_search
{
  "query": {
   "simple_query_string": {
   "fields": [ "body.exact" ],
   "query": "ski"
   }
  }
}

コンソール-結果

{
  "took": 1,
  "timed_out": false,
  "_shards": {
   "total": 1,
   "successful": 1,
   "skipped" : 0,
   "failed": 0
  },
  "hits": {
   "total" : {
   "value": 1,
   "relation": "eq"
   },
   "max_score": 0.8025915,
   "hits": [
   {
   "_index": "index",
   "_id": "1",
   "_score": 0.8025915,
   "_source": {
   "body": "Ski resort"
   }
   }
   ]
  }
}

これはエンドユーザーに簡単に公開できるものではありません。なぜなら、ユーザーが正確な一致を探しているのかどうかを判断し、それに応じて適切なフィールドに検索を振り向ける方法が必要になるからです。また、クエリの一部だけを正確に一致させ、他の部分では引き続きステミングを考慮したい場合はどうすればよいのでしょうか？

幸いなことに、query_string クエリと simple_query_string クエリには、まさにこの問題を解決する機能 quote_field_suffix があります。これは、引用符で囲まれた単語を別のフィールドにリダイレクトすべきであることを Elasticsearch に伝えます。以下を参照してください：

Python

resp = client.search(
   index="index",
   query={
   "simple_query_string": {
   "fields": [
   "body"
   ],
   "quote_field_suffix": ".exact",
   "query": "\"ski\""
   }
   },
)
print(resp)

Ruby

response = client.search(
  index: 'index',
  body: {
   query: {
   simple_query_string: {
   fields: [
   'body'
   ],
   quote_field_suffix: '.exact',
   query: '"ski"'
   }
   }
  }
)
puts response

Js

const response = await client.search({
  index: "index",
  query: {
   simple_query_string: {
   fields: ["body"],
   quote_field_suffix: ".exact",
   query: '"ski"',
   },
  },
});
console.log(response);

コンソール

GET index/_search
{
  "query": {
   "simple_query_string": {
   "fields": [ "body" ],
   "quote_field_suffix": ".exact",
   "query": "\"ski\""
   }
  }
}

コンソール-結果

{
  "took": 2,
  "timed_out": false,
  "_shards": {
   "total": 1,
   "successful": 1,
   "skipped" : 0,
   "failed": 0
  },
  "hits": {
   "total" : {
   "value": 1,
   "relation": "eq"
   },
   "max_score": 0.8025915,
   "hits": [
   {
   "_index": "index",
   "_id": "1",
   "_score": 0.8025915,
   "_source": {
   "body": "Ski resort"
   }
   }
   ]
  }
}

上記のケースでは、ski が引用符で囲まれていたため、quote_field_suffix パラメータにより body.exact フィールドで検索され、文書 1 のみが一致しました。これにより、ユーザーは正確な検索とステミング検索を自由に混合できます。

quote_field_suffix で指定したサフィックスを付加したフィールドが存在しない場合、検索はクエリ文字列のデフォルトフィールドを使用するようフォールバックします。