
[tensorflow 2]Universal-sentence-encoder-multilingual-large

father6019 2024. 8. 19. 21:49

A strong challenger has appeared against module B, the winner of the previous A/B test: the large model.

https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3

 

Google | universal-sentence-encoder | Kaggle: an encoder of greater-than-word-length text, trained on a variety of data.
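
Before wiring it into Elasticsearch, the module can be sanity-checked on its own. A minimal sketch (the test sentence is arbitrary) confirming that the large model also emits 512-dimensional vectors:

import tensorflow_hub as hub
import tensorflow_text  # importing this registers the SentencePiece ops the model needs

# Load the large multilingual model (same URL as in the scripts below).
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

# Embedding a sample sentence should yield a tensor of shape (1, 512).
print(embed(["아이폰 케이스"]).shape)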


For a fair comparison, let's delete both indexes first.

Matchup items (how each variant's input would be built is sketched right after this list):

  • Test with product name only
  • Test with product name + category
  • Test with product name + category tokens
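
Only the first variant (product name alone) is what the scripts below actually exercise; the other two would just change the text handed to the encoder. A minimal sketch, assuming each document carries a hypothetical "category" field such as "패션>남성>후드" (the field name and delimiter are assumptions, not taken from the data files):

def build_embedding_input(doc, variant):
    # Build the text to embed for each A/B test variant.
    if variant == "name":
        return doc["name"]
    if variant == "name_category":
        return doc["name"] + " " + doc["category"]
    if variant == "name_category_tokens":
        # Split the category path into tokens: "패션>남성>후드" -> "패션 남성 후드"
        return doc["name"] + " " + " ".join(doc["category"].split(">"))
    raise ValueError("unknown variant: " + variant)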

 

put_products.py

# -*- coding: utf-8 -*-

import json

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

import tensorflow_hub as hub
import tensorflow_text
import kss, numpy


##### INDEXING #####

def index_data():
    print("Creating the '" + INDEX_NAME_A + "' index.")
    print("Creating the '" + INDEX_NAME_B + "' index.")
    client.indices.delete(index=INDEX_NAME_A, ignore=[404])
    client.indices.delete(index=INDEX_NAME_B, ignore=[404])

    with open(INDEX_FILE) as index_file:
        source = index_file.read().strip()
        client.indices.create(index=INDEX_NAME_A, body=source)
        client.indices.create(index=INDEX_NAME_B, body=source)

    count = 0
    docs = []

    with open(DATA_FILE) as data_file:
        for line in data_file:
            line = line.strip()

            json_data = json.loads(line)

            docs.append(json_data)
            count += 1

            if count % BATCH_SIZE == 0:
                index_batch_a(docs)
                index_batch_b(docs)
                docs = []
                print("Indexed {} documents.".format(count))

        if docs:
            index_batch_a(docs)
            index_batch_b(docs)
            print("Indexed {} documents.".format(count))

    client.indices.refresh(index=INDEX_NAME_A)
    client.indices.refresh(index=INDEX_NAME_B)
    print("Done indexing.")


def paragraph_index(paragraph):
    # Unused helper: split a paragraph into sentences, embed each one,
    # and average the sentence vectors into a single 512-dim vector.
    avg_paragraph_vec = numpy.zeros((1, 512))
    sent_count = 0
    for sent in kss.split_sentences(paragraph[0:100]):
        # Embed the sentence and accumulate the vectors for averaging.
        avg_paragraph_vec += embed_text_a([sent])
        sent_count += 1
    avg_paragraph_vec /= sent_count
    return avg_paragraph_vec.ravel(order='C')


def index_batch_a(docs):
    name = [doc["name"] for doc in docs]
    name_vectors = embed_text_a(name)
    requests = []
    for i, doc in enumerate(docs):
        request = doc
        request["_op_type"] = "index"
        request["_index"] = INDEX_NAME_A
        request["name_vector"] = name_vectors[i]
        requests.append(request)
    bulk(client, requests)

def index_batch_b(docs):
    name = [doc["name"] for doc in docs]
    name_vectors = embed_text_b(name)
    requests = []
    for i, doc in enumerate(docs):
        request = doc
        request["_op_type"] = "index"
        request["_index"] = INDEX_NAME_B
        request["name_vector"] = name_vectors[i]
        requests.append(request)
    bulk(client, requests)

##### EMBEDDING #####

def embed_text_a(input):
    vectors = embed_a(input)
    return [vector.numpy().tolist() for vector in vectors]

def embed_text_b(input):
    vectors = embed_b(input)
    return [vector.numpy().tolist() for vector in vectors]

##### MAIN SCRIPT #####

if __name__ == '__main__':
    INDEX_NAME_A = "products_a"
    INDEX_NAME_B = "products_b"
    INDEX_FILE = "./data/products/index.json"

    DATA_FILE = "./data/products/products.json"
    BATCH_SIZE = 100

    SEARCH_SIZE = 3

    print("Downloading pre-trained embeddings from tensorflow hub...")
    embed_a = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")
    embed_b = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

    client = Elasticsearch(http_auth=('elastic', 'datalake'))

    index_data()

    print("Done.")
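
The post never shows ./data/products/index.json, but for the cosineSimilarity script query used later, name_vector has to be mapped as a 512-dim dense_vector. A plausible minimal mapping for Elasticsearch 7.x (name and price match what the scripts read; everything else here is an assumption):

{
  "settings": {
    "number_of_shards": 1
  },
  "mappings": {
    "properties": {
      "name": {"type": "text"},
      "price": {"type": "long"},
      "name_vector": {"type": "dense_vector", "dims": 512}
    }
  }
}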

 

Swap in the new challenger and run:

python src/put_products.py

get_products_ab.py

# -*- coding: utf-8 -*-
import time
import json

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

import tensorflow_hub as hub
import tensorflow_text
import kss, numpy


##### SEARCHING #####

def run_query_loop():
    while True:
        try:
            handle_query()
        except KeyboardInterrupt:
            return

def handle_query():
    query = input("Enter query: ")

    embedding_start = time.time()
    query_vector_a = embed_text_a([query])[0]
    query_vector_b = embed_text_b([query])[0]
    embedding_time = time.time() - embedding_start

    script_query_a = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, doc['name_vector']) + 1.0",
                "params": {"query_vector": query_vector_a}
            }
        }
    }

    script_query_b = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, doc['name_vector']) + 1.0",
                "params": {"query_vector": query_vector_b}
            }
        }
    }

    search_start = time.time()
    response_a = client.search(
        index=INDEX_NAME_A,
        body={
            "size": SEARCH_SIZE,
            "query": script_query_a,
            "_source": {"includes": ["name", "price"]}
        }
    )

    response_b = client.search(
        index=INDEX_NAME_B,
        body={
            "size": SEARCH_SIZE,
            "query": script_query_b,
            "_source": {"includes": ["name", "price"]}
        }
    )
    search_time = time.time() - search_start


    print("Query :", query)
    print()
    print("CASE A : ")
    for hit in response_a["hits"]["hits"]:
        print("id: {}, score: {}".format(hit["_id"], hit["_score"]))
        print(hit["_source"]["name"])
        print()
    print()
    print("CASE B : ")
    for hit in response_b["hits"]["hits"]:
        print("id: {}, score: {}".format(hit["_id"], hit["_score"]))
        print(hit["_source"]["name"])
        print()

##### INDEXING #####

def index_data():
    print("Creating the '" + INDEX_NAME_A + "' index.")
    print("Creating the '" + INDEX_NAME_B + "' index.")
    client.indices.delete(index=INDEX_NAME_A, ignore=[404])
    client.indices.delete(index=INDEX_NAME_B, ignore=[404])

    with open(INDEX_FILE) as index_file:
        source = index_file.read().strip()
        client.indices.create(index=INDEX_NAME_A, body=source)
        client.indices.create(index=INDEX_NAME_B, body=source)

    count = 0
    docs = []

    with open(DATA_FILE) as data_file:
        for line in data_file:
            line = line.strip()

            json_data = json.loads(line)

            docs.append(json_data)
            count += 1

            if count % BATCH_SIZE == 0:
                index_batch_a(docs)
                index_batch_b(docs)
                docs = []
                print("Indexed {} documents.".format(count))

        if docs:
            index_batch_a(docs)
            index_batch_b(docs)
            print("Indexed {} documents.".format(count))

    client.indices.refresh(index=INDEX_NAME_A)
    client.indices.refresh(index=INDEX_NAME_B)
    print("Done indexing.")


def paragraph_index(paragraph):
    # Unused helper: split a paragraph into sentences, embed each one,
    # and average the sentence vectors into a single 512-dim vector.
    avg_paragraph_vec = numpy.zeros((1, 512))
    sent_count = 0
    for sent in kss.split_sentences(paragraph[0:100]):
        # Embed the sentence and accumulate the vectors for averaging.
        avg_paragraph_vec += embed_text_a([sent])
        sent_count += 1
    avg_paragraph_vec /= sent_count
    return avg_paragraph_vec.ravel(order='C')


def index_batch_a(docs):
    name = [doc["name"] for doc in docs]
    name_vectors = embed_text_a(name)
    requests = []
    for i, doc in enumerate(docs):
        request = doc
        request["_op_type"] = "index"
        request["_index"] = INDEX_NAME_A
        request["name_vector"] = name_vectors[i]
        requests.append(request)
    bulk(client, requests)

def index_batch_b(docs):
    name = [doc["name"] for doc in docs]
    name_vectors = embed_text_b(name)
    requests = []
    for i, doc in enumerate(docs):
        request = doc
        request["_op_type"] = "index"
        request["_index"] = INDEX_NAME_B
        request["name_vector"] = name_vectors[i]
        requests.append(request)
    bulk(client, requests)

##### EMBEDDING #####

def embed_text_a(input):
    vectors = embed_a(input)
    return [vector.numpy().tolist() for vector in vectors]

def embed_text_b(input):
    vectors = embed_b(input)
    return [vector.numpy().tolist() for vector in vectors]

##### MAIN SCRIPT #####

if __name__ == '__main__':
    INDEX_NAME_A = "products_a"
    INDEX_NAME_B = "products_b"
    INDEX_FILE = "./data/products/index.json"

    DATA_FILE = "./data/products/products.json"
    BATCH_SIZE = 100

    SEARCH_SIZE = 3

    print("Downloading pre-trained embeddings from tensorflow hub...")
    embed_a = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")
    embed_b = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

    client = Elasticsearch(http_auth=('elastic', 'datalake'))

    run_query_loop()
    print("Done.")
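
One detail in the script query: cosineSimilarity returns a value in [-1, 1], but Elasticsearch script scores must not be negative, so the + 1.0 shifts everything into [0, 2]. The ~1.7 scores below therefore correspond to a raw cosine similarity of about 0.7. A quick illustrative sketch of what the Painless script computes, in plain numpy:

import numpy

def script_score(query_vector, doc_vector):
    # Mirrors "cosineSimilarity(params.query_vector, doc['name_vector']) + 1.0"
    q = numpy.asarray(query_vector)
    d = numpy.asarray(doc_vector)
    cosine = q.dot(d) / (numpy.linalg.norm(q) * numpy.linalg.norm(d))
    return cosine + 1.0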


Query : 아이폰 케이스

CASE A :
id: CcErUn4BAf0FcTmqBIOi, score: 1.7229252
케이맥스 아이폰 11 프로용 클리어핏 케이스

id: X8ErUn4BAf0FcTmqAoJ1, score: 1.701367
뷰씨 아이폰 7플러스용 슬림 케이스

id: QsErUn4BAf0FcTmqAoJ1, score: 1.688775
베리어 아이폰 X용 에어백 케이스


CASE B :
id: NsErUn4BAf0FcTmqBYMk, score: 1.7272676
나하로 아이폰 11용 리얼하이브리드 케이스

id: 7cErUn4BAf0FcTmqAIFz, score: 1.7031344
araree 아이폰 11 프로용 마하 케이스

id: C8ErUn4BAf0FcTmqAIJz, score: 1.7014272
araree 아이폰 11용 타이포스킨 케이스


Neck and neck..


Query : 나이키 후드

CASE A :
id: GcEqUn4BAf0FcTmqzXRZ, score: 1.7612474
나이키 프렌치 테리 후드 DD4667-010

id: u8EqUn4BAf0FcTmq0nWt, score: 1.7200605
나이키 클럽 기모 후드티 835585-010

id: 5cEqUn4BAf0FcTmq0HQO, score: 1.6220633
나이키 에어로레이어 재킷 CU5390-010


CASE B :
id: GcEqUn4BAf0FcTmq03Zp, score: 1.7441814
나이키 온 덱 CU3958-002

id: s8EqUn4BAf0FcTmqznQF, score: 1.741488
나이키 BV6887-010

id: fcEqUn4BAf0FcTmqznQE, score: 1.7042294
나이키 프렌치 테리 후드 DD4667-010

 

Round 1 - victory for module A: for "나이키 후드", CASE A's top two hits are actual hoodies, while CASE B's top two don't even have 후드 in the name.
