[tensorflow 2] Universal-sentence-encoder-multilingual-large
father6019 · 2024. 8. 19. 21:49

A strong challenger has appeared for module B, the winner of the last A/B test: the Large model.
https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3
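Quick sanity check before the rematch (a minimal sketch, assuming tensorflow_hub and tensorflow_text are installed): both the large and the base multilingual encoders load from TF Hub and both return 512-dimensional vectors, so the two indices can share the same mapping.

# Minimal sketch: load both USE multilingual models and confirm the output shape.
import tensorflow_hub as hub
import tensorflow_text  # registers the SentencePiece ops the models need

use_large = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")
use_base = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

sample = ["나이키 프렌치 테리 후드"]
print(use_large(sample).shape)  # (1, 512)
print(use_base(sample).shape)   # (1, 512)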
For a fair judging, let's delete both indices first.
Match-ups
- Test with product name
- Test with product name + category
- Test with product name + category tokens
put_products.py
# -*- coding: utf-8 -*-
import json
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import tensorflow_hub as hub
import tensorflow_text
import kss, numpy
##### INDEXING #####
def index_data():
print("Creating the '" + INDEX_NAME_A + "' index.")
print("Creating the '" + INDEX_NAME_B + "' index.")
client.indices.delete(index=INDEX_NAME_A, ignore=[404])
client.indices.delete(index=INDEX_NAME_A, ignore=[404])
with open(INDEX_FILE) as index_file:
source = index_file.read().strip()
client.indices.create(index=INDEX_NAME_A, body=source)
client.indices.create(index=INDEX_NAME_B, body=source)
count = 0
docs = []
with open(DATA_FILE) as data_file:
for line in data_file:
line = line.strip()
json_data = json.loads(line)
docs.append(json_data)
count += 1
if count % BATCH_SIZE == 0:
index_batch_a(docs)
index_batch_b(docs)
docs = []
print("Indexed {} documents.".format(count))
if docs:
index_batch_a(docs)
index_batch_b(docs)
print("Indexed {} documents.".format(count))
client.indices.refresh(index=INDEX_NAME_A)
client.indices.refresh(index=INDEX_NAME_B)
print("Done indexing.")
def paragraph_index(paragraph):
# 문장단위 분리
avg_paragraph_vec = numpy.zeros((1, 512))
sent_count = 0
for sent in kss.split_sentences(paragraph[0:100]):
# 문장을 embed 하기
# vector들을 평균으로 더해주기
avg_paragraph_vec += embed_text([sent])
sent_count += 1
avg_paragraph_vec /= sent_count
return avg_paragraph_vec.ravel(order='C')
def index_batch_a(docs):
    name = [doc["name"] for doc in docs]
    name_vectors = embed_text_a(name)
    requests = []
    for i, doc in enumerate(docs):
        request = doc
        request["_op_type"] = "index"
        request["_index"] = INDEX_NAME_A
        request["name_vector"] = name_vectors[i]
        requests.append(request)
    bulk(client, requests)

def index_batch_b(docs):
    name = [doc["name"] for doc in docs]
    name_vectors = embed_text_b(name)
    requests = []
    for i, doc in enumerate(docs):
        request = doc
        request["_op_type"] = "index"
        request["_index"] = INDEX_NAME_B
        request["name_vector"] = name_vectors[i]
        requests.append(request)
    bulk(client, requests)
##### EMBEDDING #####
def embed_text_a(input):
    vectors = embed_a(input)
    return [vector.numpy().tolist() for vector in vectors]

def embed_text_b(input):
    vectors = embed_b(input)
    return [vector.numpy().tolist() for vector in vectors]
##### MAIN SCRIPT #####
if __name__ == '__main__':
    INDEX_NAME_A = "products_a"
    INDEX_NAME_B = "products_b"
    INDEX_FILE = "./data/products/index.json"
    DATA_FILE = "./data/products/products.json"
    BATCH_SIZE = 100
    SEARCH_SIZE = 3

    print("Downloading pre-trained embeddings from tensorflow hub...")
    embed_a = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")
    embed_b = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

    client = Elasticsearch(http_auth=('elastic', 'datalake'))

    index_data()
    print("Done.")
Swap in the new challenger and run it:
python src/put_products.py
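products.json is read one line at a time with json.loads, so it's expected to be newline-delimited JSON with at least the name and price fields the scripts use. A hypothetical line (the price is made up):

{"name": "나이키 프렌치 테리 후드 DD4667-010", "price": 59000}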
get_products_ab.py
# -*- coding: utf-8 -*-
import time
import json
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import tensorflow_hub as hub
import tensorflow_text
import kss, numpy
##### SEARCHING #####
def run_query_loop():
    while True:
        try:
            handle_query()
        except KeyboardInterrupt:
            return

def handle_query():
    query = input("Enter query: ")

    embedding_start = time.time()
    query_vector_a = embed_text_a([query])[0]
    query_vector_b = embed_text_b([query])[0]
    embedding_time = time.time() - embedding_start

    script_query_a = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, doc['name_vector']) + 1.0",
                "params": {"query_vector": query_vector_a}
            }
        }
    }
    script_query_b = {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, doc['name_vector']) + 1.0",
                "params": {"query_vector": query_vector_b}
            }
        }
    }

    search_start = time.time()
    response_a = client.search(
        index=INDEX_NAME_A,
        body={
            "size": SEARCH_SIZE,
            "query": script_query_a,
            "_source": {"includes": ["name", "price"]}
        }
    )
    response_b = client.search(
        index=INDEX_NAME_B,
        body={
            "size": SEARCH_SIZE,
            "query": script_query_b,
            "_source": {"includes": ["name", "price"]}
        }
    )
    search_time = time.time() - search_start

    print("검색어 :", query)
    print()
    print("CASE A : ")
    for hit in response_a["hits"]["hits"]:
        print("id: {}, score: {}".format(hit["_id"], hit["_score"]))
        print(hit["_source"]["name"])
        print()
    print()
    print("CASE B : ")
    for hit in response_b["hits"]["hits"]:
        print("id: {}, score: {}".format(hit["_id"], hit["_score"]))
        print(hit["_source"]["name"])
        print()
##### INDEXING #####
def index_data():
print("Creating the '" + INDEX_NAME_A + "' index.")
print("Creating the '" + INDEX_NAME_B + "' index.")
client.indices.delete(index=INDEX_NAME_A, ignore=[404])
client.indices.delete(index=INDEX_NAME_A, ignore=[404])
with open(INDEX_FILE) as index_file:
source = index_file.read().strip()
client.indices.create(index=INDEX_NAME_A, body=source)
client.indices.create(index=INDEX_NAME_B, body=source)
count = 0
docs = []
with open(DATA_FILE) as data_file:
for line in data_file:
line = line.strip()
json_data = json.loads(line)
docs.append(json_data)
count += 1
if count % BATCH_SIZE == 0:
index_batch_a(docs)
index_batch_b(docs)
docs = []
print("Indexed {} documents.".format(count))
if docs:
index_batch_a(docs)
index_batch_b(docs)
print("Indexed {} documents.".format(count))
client.indices.refresh(index=INDEX_NAME_A)
client.indices.refresh(index=INDEX_NAME_B)
print("Done indexing.")
def paragraph_index(paragraph):
# 문장단위 분리
avg_paragraph_vec = numpy.zeros((1, 512))
sent_count = 0
for sent in kss.split_sentences(paragraph[0:100]):
# 문장을 embed 하기
# vector들을 평균으로 더해주기
avg_paragraph_vec += embed_text([sent])
sent_count += 1
avg_paragraph_vec /= sent_count
return avg_paragraph_vec.ravel(order='C')
def index_batch_a(docs):
    name = [doc["name"] for doc in docs]
    name_vectors = embed_text_a(name)
    requests = []
    for i, doc in enumerate(docs):
        request = doc
        request["_op_type"] = "index"
        request["_index"] = INDEX_NAME_A
        request["name_vector"] = name_vectors[i]
        requests.append(request)
    bulk(client, requests)

def index_batch_b(docs):
    name = [doc["name"] for doc in docs]
    name_vectors = embed_text_b(name)
    requests = []
    for i, doc in enumerate(docs):
        request = doc
        request["_op_type"] = "index"
        request["_index"] = INDEX_NAME_B
        request["name_vector"] = name_vectors[i]
        requests.append(request)
    bulk(client, requests)

##### EMBEDDING #####
def embed_text_a(input):
    vectors = embed_a(input)
    return [vector.numpy().tolist() for vector in vectors]

def embed_text_b(input):
    vectors = embed_b(input)
    return [vector.numpy().tolist() for vector in vectors]
##### MAIN SCRIPT #####
if __name__ == '__main__':
    INDEX_NAME_A = "products_a"
    INDEX_NAME_B = "products_b"
    INDEX_FILE = "./data/products/index.json"
    DATA_FILE = "./data/products/products.json"
    BATCH_SIZE = 100
    SEARCH_SIZE = 3

    print("Downloading pre-trained embeddings from tensorflow hub...")
    embed_a = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")
    embed_b = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

    client = Elasticsearch(http_auth=('elastic', 'datalake'))

    run_query_loop()
    print("Done.")
검색어 : 아이폰 케이스
CASE A :
id: CcErUn4BAf0FcTmqBIOi, score: 1.7229252
케이맥스 아이폰 11 프로용 클리어핏 케이스
id: X8ErUn4BAf0FcTmqAoJ1, score: 1.701367
뷰씨 아이폰 7플러스용 슬림 케이스
id: QsErUn4BAf0FcTmqAoJ1, score: 1.688775
베리어 아이폰 X용 에어백 케이스
CASE B :
id: NsErUn4BAf0FcTmqBYMk, score: 1.7272676
나하로 아이폰 11용 리얼하이브리드 케이스
id: 7cErUn4BAf0FcTmqAIFz, score: 1.7031344
araree 아이폰 11 프로용 마하 케이스
id: C8ErUn4BAf0FcTmqAIJz, score: 1.7014272
araree 아이폰 11용 타이포스킨 케이스
Pretty much neck and neck..
검색어 : 나이키 후드
CASE A :
id: GcEqUn4BAf0FcTmqzXRZ, score: 1.7612474
나이키 프렌치 테리 후드 DD4667-010
id: u8EqUn4BAf0FcTmq0nWt, score: 1.7200605
나이키 클럽 기모 후드티 835585-010
id: 5cEqUn4BAf0FcTmq0HQO, score: 1.6220633
나이키 에어로레이어 재킷 CU5390-010
CASE B :
id: GcEqUn4BAf0FcTmq03Zp, score: 1.7441814
나이키 온 덱 CU3958-002
id: s8EqUn4BAf0FcTmqznQF, score: 1.741488
나이키 BV6887-010
id: fcEqUn4BAf0FcTmqznQE, score: 1.7042294
나이키 프렌치 테리 후드 DD4667-010
Round 1 - Module A wins