TF-IDF
import jieba
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
sentences = [
    "今天天气真好,阳光明媚。",
    "关键字匹配是一种常见的文本处理任务。",
    "计算机不认识人类语言,要转成词向量。",
    "富强、民主、文明、和谐、自由、平等、公正、法治、爱国、敬业、诚信、友善。",
    "中文分词工具对文本处理很有帮助。",
]
query_sentence = "关键字匹配和文本处理任务"
data = {"Sentence": sentences}
df = pd.DataFrame(data)
# Segment with jieba and re-join with spaces so TfidfVectorizer can tokenise on whitespace
def preprocess(text):
    words = jieba.lcut(text)
    return " ".join(words)
df["Preprocessed_Sentence"] = df["Sentence"].apply(preprocess)
query_sentence = preprocess(query_sentence)
# Fit TF-IDF on the corpus sentences plus the query, then compute pairwise cosine similarities
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(list(df["Preprocessed_Sentence"]) + [query_sentence])
similarities = cosine_similarity(tfidf_matrix)
# The last row is the query; drop the final column (its similarity with itself)
query_similarity = similarities[-1, :-1]
# Rank the corpus sentences by similarity (n exceeds the corpus size here, so all 5 are returned)
n = 10
top_indices = query_similarity.argsort()[-n:][::-1]
similar_sentences = df.loc[top_indices, "Sentence"].tolist()
similarity_scores = [query_similarity[i] for i in top_indices]
result_data = {"Similar_Sentence": similar_sentences, "Similarity_Score": similarity_scores}
result_df = pd.DataFrame(result_data)
print("查询句子:", query_sentence)
print("\n相似度最高的句子:")
result_df
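
A small variation on the ranking step above: since only the query-versus-corpus scores are needed, the query row can be compared directly against the corpus rows instead of building the full pairwise matrix. A minimal sketch reusing the variables defined above (query_vec and corpus_vecs are new names introduced here); it produces the same scores as the similarities[-1, :-1] slice:

# Compare the query's TF-IDF row against the corpus rows only
query_vec = tfidf_matrix[-1]       # 1 x vocab sparse row for the query
corpus_vecs = tfidf_matrix[:-1]    # one sparse row per corpus sentence
query_similarity = cosine_similarity(query_vec, corpus_vecs)[0]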

spaCy
import warnings
warnings.filterwarnings("ignore")  # silences spaCy's warning that this pipeline has no static word vectors
import spacy
import pandas as pd
nlp = spacy.load("zh_core_web_sm")  # small Chinese pipeline, no static word vectors
data = {
    "Sentences": [
        "今天天气真好,阳光明媚。",
        "关键字匹配是一种常见的文本处理任务。",
        "计算机不认识人类语言,要转成词向量。",
        "富强、民主、文明、和谐、自由、平等、公正、法治、爱国、敬业、诚信、友善。",
        "中文分词工具对文本处理很有帮助。",
    ]
}
df = pd.DataFrame(data)
target_sentence = "关键字匹配和文本处理任务"
similarity_scores = []
# Parse the target once, then score it against each corpus sentence
doc1 = nlp(target_sentence)
for sentence in df["Sentences"]:
    doc2 = nlp(sentence)
    similarity = doc1.similarity(doc2)
    similarity_scores.append(similarity)
df["Similarity Score"] = similarity_scores
n = 10
top_n_similar_sentences = df.sort_values(by="Similarity Score", ascending=False).head(n)
top_n_similar_sentences
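
Because zh_core_web_sm ships without static word vectors, the scores above should be treated as rough. A hedged sketch of the same comparison with a vector-equipped pipeline; it assumes the medium Chinese model has been installed (python -m spacy download zh_core_web_md), and the nlp_md name and the new column label are introduced here for illustration:

# Sketch (assumes zh_core_web_md is installed): the medium pipeline includes
# static word vectors, so Doc.similarity reflects word-vector similarity.
nlp_md = spacy.load("zh_core_web_md")
target_doc = nlp_md(target_sentence)
df["Similarity Score (md)"] = [target_doc.similarity(nlp_md(s)) for s in df["Sentences"]]
df.sort_values(by="Similarity Score (md)", ascending=False).head(n)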

BERT
import warnings
warnings.filterwarnings("ignore")
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import torch
from sklearn.metrics.pairwise import cosine_similarity
model_name = "model/bert-base-chinese"  # local copy; "bert-base-chinese" would download it from the Hugging Face Hub
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
data = {
    "Sentences": [
        "今天天气真好,阳光明媚。",
        "关键字匹配是一种常见的文本处理任务。",
        "计算机不认识人类语言,要转成词向量。",
        "富强、民主、文明、和谐、自由、平等、公正、法治、爱国、敬业、诚信、友善。",
        "中文分词工具对文本处理很有帮助。",
    ]
}
df = pd.DataFrame(data)
target_sentence = "关键字匹配和文本处理任务"
similarity_scores = []
# Mean-pool BERT's last hidden states into one vector per sentence, then compare with cosine similarity
target_embedding = model(**tokenizer(target_sentence, return_tensors="pt", padding=True, truncation=True)).last_hidden_state.mean(dim=1)
for sentence in df["Sentences"]:
    sentence_embedding = model(**tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)).last_hidden_state.mean(dim=1)
    similarity = cosine_similarity(target_embedding.detach().numpy(), sentence_embedding.detach().numpy())[0][0]
    similarity_scores.append(similarity)
df["Similarity Score"] = similarity_scores
n = 10
top_n_similar_sentences = df.sort_values(by="Similarity Score", ascending=False).head(n)
top_n_similar_sentences
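
One possible refinement, sketched below: mean-pooling with the attention mask ignores padding tokens when sentences are batched together, and torch.no_grad() avoids building a computation graph at inference time. The embed helper and the new column label are hypothetical names introduced here, not part of the code above:

def embed(texts):
    # Tokenise a list of sentences as one padded batch
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state           # (batch, seq_len, hidden)
    mask = inputs["attention_mask"].unsqueeze(-1).float()    # (batch, seq_len, 1)
    return (hidden * mask).sum(dim=1) / mask.sum(dim=1)      # masked mean over real tokens

target_vec = embed([target_sentence])
corpus_vecs = embed(df["Sentences"].tolist())
df["Similarity Score (masked mean)"] = cosine_similarity(target_vec.numpy(), corpus_vecs.numpy())[0]
df.sort_values(by="Similarity Score (masked mean)", ascending=False).head(n)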
