Elasticsearch Learning Notes — Dense/Sparse Vector Search Records

In Elasticsearch 7.0, ES introduced field types for high-dimensional vectors:

dense_vector stores a dense vector as an array of float values; each value can be zero, negative or positive. The array length must not exceed 1024, and the length may differ from document to document.

sparse_vector stores a sparse vector as a non-nested JSON object: each key is the vector position — an integer encoded as a string, in the range [0, 65535] — and each value is a single float, which can be zero, negative or positive.

Elasticsearch version: 7.3.0

Environmental preparation:

curl -H "Content-Type: application/json" -XPUT 'http://192.168.0.1:9200/article_v1/' -d '
{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0
  },
  "mappings": {
    "dynamic": "strict",
    "properties": {
      "id": {
        "type": "keyword"
      },
      "title": {
        "analyzer": "ik_smart",
        "type": "text"
      },
      "title_dv": {
        "type": "dense_vector",
        "dims": 200
      },
      "title_sv": {
        "type": "sparse_vector"
      }
    }
  }
}
'

Test validation code:

# -*- coding:utf-8 -*-

import os
import sys
import jieba
import logging
import pymongo
from elasticsearch import Elasticsearch
from elasticsearch.serializer import TextSerializer, JSONSerializer
from gensim.models.doc2vec import TaggedDocument, Doc2Vec

# Python-2-only hack: force the interpreter's default string encoding to
# UTF-8 so implicit str<->unicode conversions don't raise UnicodeDecodeError.
# reload(sys) re-exposes sys.setdefaultencoding, which site.py deletes at
# startup. This is a no-op guard on interpreters already defaulting to UTF-8
# and does not exist on Python 3 (where reload lives in importlib).
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)

# Crawl some news on the Internet and store it in the database
client = pymongo.MongoClient(host='192.168.0.1', port=27017)
db = client['news']

# Generous timeout: inferring/indexing vectors and script_score queries can be slow.
es = Elasticsearch([{'host': '192.168.0.1', 'port': 9200}], timeout=3600)

# Build the stop-word path portably instead of hand-concatenating os.sep.
chinese_stop_words_file = os.path.abspath(
    os.path.join(os.getcwd(), '..', 'static', 'dic', 'chinese_stop_words.txt'))
# A context manager closes the file deterministically (the original leaked the
# handle), and a set gives O(1) membership tests during tokenization.
with open(chinese_stop_words_file, 'r') as stop_words_fp:
    chinese_stop_words = set(line.strip() for line in stop_words_fp)

# Running total of tokens kept across all sentence_segment() calls.
total_cut_word_count = 0


# Sentence segmentation
# Sentence segmentation
def sentence_segment(sentence):
    """Tokenize *sentence* with jieba and drop Chinese stop words.

    Side effect: increments the module-level ``total_cut_word_count`` by the
    number of tokens kept.

    Returns the kept tokens as a list of strings.
    """
    global total_cut_word_count
    # The module-level stop-word container may be a list; normalize to a set
    # once per call so each membership test is O(1) instead of O(n).
    stop_words = set(chinese_stop_words)
    kept = [word for word in jieba.cut(sentence) if word not in stop_words]
    total_cut_word_count += len(kept)
    return kept


# Preparing Corpus
# Preparing Corpus
def prepare_doc_corpus():
    """Yield one TaggedDocument per news article for Doc2Vec training.

    Articles are read from MongoDB ordered by ``create_time``; the Mongo
    ``_id`` is used as the document tag. Articles missing a title or content
    are skipped. Note this is a generator: it can be iterated only once.
    """
    datas = db['netease_ent_news_detail'].find(
        {"create_time": {"$ne": None}}).sort('create_time', pymongo.ASCENDING)
    # Parenthesized print is valid on both Python 2 and Python 3.
    print(datas.count())
    for data in datas:
        if data['title'] is not None and data['content'] is not None:
            title = str(data['title']).strip()
            yield TaggedDocument(sentence_segment(title), [data['_id']])


# Training model
# Training model
def train_doc_model():
    """Train a 200-dim Doc2Vec model on the news corpus and save it to disk.

    Bug fix: ``prepare_doc_corpus()`` returns a generator, which the original
    code exhausted inside ``build_vocab()``, leaving ``train()`` with no data
    — and Doc2Vec re-iterates the corpus once per epoch (20 here), so even a
    fresh generator would only survive the first pass. Materializing the
    corpus gives every pass the full data set.
    """
    corpus = list(prepare_doc_corpus())
    doc2vec = Doc2Vec(vector_size=200, min_count=2, window=5, workers=4, epochs=20)
    doc2vec.build_vocab(corpus)
    doc2vec.train(corpus, total_examples=doc2vec.corpus_count, epochs=doc2vec.epochs)
    doc2vec.save('doc2vec.model')


def insert_data_to_es():
    """Index every news article into Elasticsearch with its title vector.

    For each article the saved Doc2Vec model infers a 200-dim dense vector
    for the title; the document is created with the Mongo ``_id`` as the ES
    id. HTTP 400/409 responses (mapping errors / duplicate ids) are ignored
    so re-running the loader skips documents that already exist.
    """
    datas = db['netease_ent_news_detail'].find(
        {"create_time": {"$ne": None}}).sort('create_time', pymongo.ASCENDING)
    print(datas.count())
    doc2vec = Doc2Vec.load('doc2vec.model')
    for data in datas:
        # Guard clause: skip articles without both a title and content.
        if data['title'] is None or data['content'] is None:
            continue
        sentence = str(data['title']).strip()
        # infer_vector returns a numpy array; ES needs a plain JSON list.
        title_dv = doc2vec.infer_vector(sentence_segment(sentence)).tolist()
        body = {"id": data['_id'], "title": data['title'], "title_dv": title_dv}
        es_result = es.create(index="article_v1", doc_type="_doc",
                              id=data['_id'], body=body, ignore=[400, 409])
        print(es_result)


# The cosine Similarity function calculates the dense_vector similarity between a given document and a document in an index library
# The cosineSimilarity function calculates the dense_vector similarity between a given document and a document in an index library
def search_es_dense_vertor_1(sentence):
    """Print the top-5 articles whose title vector is most cosine-similar to *sentence*.

    Infers a query vector with the saved Doc2Vec model, then runs an ES
    ``script_score`` query over all documents in ``article_v1``.
    """
    doc2vec = Doc2Vec.load('doc2vec.model')
    query_vector = doc2vec.infer_vector(sentence_segment(sentence)).tolist()
    # "+ 1" shifts cosine similarity from [-1, 1] into [0, 2]: ES rejects
    # negative script_score values.
    body = {
        "query": {
            "script_score": {
                "query": {
                    "match_all": {}
                },
                "script": {
                    "source": "cosineSimilarity(params.queryVector, doc['title_dv']) + 1",
                    "params": {
                        "queryVector": query_vector
                    }
                }
            }
        },
        "from": 0,
        "size": 5
    }
    result = es.search(index="article_v1", body=body)
    for hit in result['hits']['hits']:
        for key, value in hit['_source'].items():
            # Parenthesized print is valid on both Python 2 and Python 3.
            print('%s %s' % (key, value))
        print('----------')


# The dotProduct function calculates the distance between the dot product of a given document and an index library document
# The dotProduct function calculates the distance between the dot product of a given document and an index library document
def search_es_dense_vertor_2(sentence):
    """Print the top-5 articles ranked by dot product with *sentence*'s vector.

    Infers a query vector with the saved Doc2Vec model, then runs an ES
    ``script_score`` query over all documents in ``article_v1``.

    NOTE(review): unlike cosine similarity, a dot product is unbounded, so
    "+ 1" does not guarantee a non-negative score; ES rejects negative
    script_score values. If queries fail on some documents, wrap the dot
    product (e.g. with a sigmoid) or normalize the vectors — TODO confirm.
    """
    doc2vec = Doc2Vec.load('doc2vec.model')
    query_vector = doc2vec.infer_vector(sentence_segment(sentence)).tolist()
    body = {
        "query": {
            "script_score": {
                "query": {
                    "match_all": {}
                },
                "script": {
                    "source": "dotProduct(params.queryVector, doc['title_dv']) + 1",
                    "params": {
                        "queryVector": query_vector
                    }
                }
            }
        },
        "from": 0,
        "size": 5
    }
    result = es.search(index="article_v1", body=body)
    for hit in result['hits']['hits']:
        for key, value in hit['_source'].items():
            # Parenthesized print is valid on both Python 2 and Python 3.
            print('%s %s' % (key, value))
        print('----------')

 

 

Keywords: ElasticSearch JSON curl Database

Added by lssjg on Wed, 02 Oct 2019 11:10:14 +0300