PHP前端开发

实施相似性搜索算法

百变鹏仔 5个月前 (01-15) #Python

文章标签相似性

获取数据

import pandas as pd

# Sample policy-rule descriptions used as the corpus for the similarity
# experiments in the following sections.
descripciones = [
    'all users must reset passwords every 90 days.',
    'passwords need to be reset by all users every 90 days.',
    'admin access should be restricted.',
    'passwords must change for users every 90 days.',
    'passwords must change for users every 80 days.',
]

# Load the dataset: one row per description, with sequential 1-based rule ids.
data = pd.DataFrame({
    'rule_id': range(1, len(descripciones) + 1),
    'description': descripciones,
})

词汇相似度

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Vectorize the descriptions with TF-IDF.
# NOTE: despite its name, `vectorizer` holds the result of fit_transform
# (the TF-IDF matrix of the descriptions), not the TfidfVectorizer itself.
vectorizer = TfidfVectorizer().fit_transform(data['description'])

# Compute the pairwise cosine-similarity matrix.
cosine_sim_matrix = cosine_similarity(vectorizer)


def find_related_rules(matrix, rule_ids, threshold=0.8):
    """Group rules whose pairwise similarity reaches ``threshold``.

    Args:
        matrix: Square 2-D similarity matrix supporting ``matrix[i, j]``
            indexing; row/column ``i`` corresponds to ``rule_ids[i]``.
        rule_ids: Sequence of rule identifiers, in matrix order.
        threshold: Minimum similarity (inclusive) for two rules to be
            reported as related.

    Returns:
        A dict mapping a rule id to a list of ``(related_rule_id, score)``
        tuples, with ``score`` rounded to two decimals. Each pair is
        reported once, under the rule that comes first in ``rule_ids``;
        rules with no related rule are omitted.
    """
    related_rules = {}
    seen_pairs = set()  # guards against reporting the same id pair twice
    for i in range(len(matrix)):
        related = []
        # j starts at i + 1 so only the upper triangle is scanned, which
        # skips self-similarity and the mirrored (b, a) entries.
        for j in range(i + 1, len(matrix)):
            if matrix[i, j] >= threshold:
                pair = (rule_ids[i], rule_ids[j])
                if pair not in seen_pairs:
                    seen_pairs.add(pair)
                    related.append((rule_ids[j], round(matrix[i, j], 2)))
        if related:
            related_rules[rule_ids[i]] = related
    return related_rules


# Apply the function to find related rules.
related_rules = find_related_rules(cosine_sim_matrix, data['rule_id'].tolist(), threshold=0.8)

# Display the related rules.
print("reglas relacionadas por similitud:")
for rule, relations in related_rules.items():
    print(f"rule {rule} es similar a:")
    for related_rule, score in relations:
        print(f"  - rule {related_rule} con similitud de {score}")

语义相似度

# Requires the sentence-transformers package. In a notebook, install it with:
#   !pip install sentence-transformers
from sentence_transformers import SentenceTransformer, util

# Load the pre-trained model for generating embeddings.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate sentence embeddings for each rule description. The column is
# named 'description' (lowercase) in the DataFrame built earlier; a plain
# list of strings is passed to encode().
embeddings = model.encode(data['description'].tolist(), convert_to_tensor=True)

# Compute the semantic similarity matrix and move it to a NumPy array so it
# can be indexed as matrix[i, j] below.
cosine_sim_matrix = util.cos_sim(embeddings, embeddings).cpu().numpy()


def find_related_rules(matrix, rule_ids, threshold=0.8):
    """Group rules whose pairwise semantic similarity reaches ``threshold``.

    Args:
        matrix: Square 2-D similarity matrix supporting ``matrix[i, j]``
            indexing; row/column ``i`` corresponds to ``rule_ids[i]``.
        rule_ids: Sequence of rule identifiers, in matrix order.
        threshold: Minimum similarity (inclusive) for two rules to be
            reported as related.

    Returns:
        A dict mapping a rule id to a list of ``(related_rule_id, score)``
        tuples, with ``score`` rounded to two decimals. Each pair is
        reported once, under the rule that comes first in ``rule_ids``;
        rules with no related rule are omitted.
    """
    related_rules = {}
    seen_pairs = set()  # guards against reporting the same id pair twice
    for i in range(len(matrix)):
        related = []
        # Only the upper triangle is scanned, which skips self-similarity
        # and the mirrored (B, A) entries.
        for j in range(i + 1, len(matrix)):
            if matrix[i, j] >= threshold:
                pair = (rule_ids[i], rule_ids[j])
                if pair not in seen_pairs:
                    seen_pairs.add(pair)
                    related.append((rule_ids[j], round(matrix[i, j], 2)))
        if related:
            related_rules[rule_ids[i]] = related
    return related_rules


# Apply the function to find related rules ('rule_id' is the lowercase
# column name used when the DataFrame was created).
related_rules = find_related_rules(cosine_sim_matrix, data['rule_id'].tolist(), threshold=0.8)

# Display the related rules.
print("Reglas relacionadas por similitud semántica:")
for rule, relations in related_rules.items():
    print(f"Rule {rule} es similar a:")
    for related_rule, score in relations:
        print(f"  - Rule {related_rule} con similitud de {score}")

文章推荐

本站为非营利性网站，不接受任何赞助和广告。

特别声明：本网站尊重并保护知识产权，根据《信息网络传播权保护条例》，如果我们转载的作品侵犯了您的权利，请在一个月内通知我们，我们会及时删除。 | 举报邮箱：344225443@qq.com

互联网ICP备案：陕ICP备2023000799号 Copyright 2023-2024 sharedbk.com All rights reserved | (c)2008-2024 共享博客