from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim

# Load a model trained with Matryoshka representation learning.
model = SentenceTransformer("tomaarsen/mpnet-base-nli-matryoshka")

matryoshka_dim = 64
embeddings = model.encode(
    [
        "The weather is so nice!",
        "It's so sunny outside!",
        "He drove to the stadium.",
    ]
)
# Shrink the embedding dimensions: Matryoshka models keep the most
# important information in the leading dimensions, so truncation is safe.
embeddings = embeddings[..., :matryoshka_dim]
print(embeddings.shape)
# => (3, 64)

# Similarity of the first sentence to the other two:
similarities = cos_sim(embeddings[0], embeddings[1:])
print(similarities)
# => tensor([[0.8910, 0.1337]])
模型链接: tomaarsen/mpnet-base-nli-matryoshka
请随意尝试使用不同的 matryoshka_dim 值,并观察这对相似度的影响。你可以在本地运行这段代码、在云端运行(例如使用 Google Colab),或者查看演示来进行实验。
参考文献:
SentenceTransformer
SentenceTransformer.encode
util.cos_sim
Matryoshka Embeddings - 推理
点击这里查看如何使用 Nomic v1.5 Matryoshka 模型
from sentence_transformers import SentenceTransformer
from sentence_transformers.util import cos_sim
import torch.nn.functional as F

# Nomic's model requires trust_remote_code for its custom architecture.
model = SentenceTransformer("nomic-ai/nomic-embed-text-v1.5", trust_remote_code=True)

matryoshka_dim = 64
embeddings = model.encode(
    [
        "search_query: What is TSNE?",
        "search_document: t-distributed stochastic neighbor embedding (t-SNE) is a statistical method for visualizing high-dimensional data by giving each datapoint a location in a two or three-dimensional map.",
        "search_document: Amelia Mary Earhart was an American aviation pioneer and writer.",
    ],
    convert_to_tensor=True,
)
# The Nomic team uses a custom architecture, making them recommend
# Layer Normalization before truncation.
embeddings = F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
# BUG FIX: the original snippet evaluated the slice but discarded the
# result, so the embeddings were never actually truncated. Assign it back.
embeddings = embeddings[..., :matryoshka_dim]  # Shrink the embedding dimensions
Kusupati, A., Bhatt, G., Rege, A., Wallingford, M., Sinha, A., Ramanujan, V., … & Farhadi, A. (2022). Matryoshka representation learning. Advances in Neural Information Processing Systems, 35, 30233-30249. https://arxiv.org/abs/2205.13147