Apriori 算法的 Python 实现

from typing import *

from typing import List, Tuple
from itertools import combinations


def loadDataSet():
    """Return the small demo transaction database as a list of integer sets."""
    transactions = [
        {1, 2, 4, 5},
        {2, 3, 5},
        {1, 2, 3, 5},
        {2, 5},
    ]
    return transactions


def loadCharDataSet():
    """Return a demo transaction database built from letter strings.

    Each transaction is a set holding the code points (``ord``) of the
    letters in one word, so the rest of the pipeline can work on integers.
    """
    words = (
        "abde",
        "bcd",
        "abde",
        "acde",
        "bcde",
        "bde",
        "cd",
        "abc",
        "ade",
        "bd",
    )
    transactions = []
    for word in words:
        transactions.append({ord(letter) for letter in word})
    return transactions


def get_k_item(data: List[Set[int]], k: int) -> List[Set[int]]:
    """
    Enumerate every candidate k-itemset.

    The distinct elements appearing anywhere in ``data`` are collected, and
    each size-``k`` combination of them is returned as a set.
    """
    # Union of all elements seen in the given itemsets.
    distinct_elements = {element for itemset in data for element in itemset}

    candidates = []
    for combo in combinations(distinct_elements, k):
        candidates.append(set(combo))
    return candidates


def get_one_support_rate(dataset: List[Set[int]], item: Set[int]) -> float:
    """
    Compute the support of a single itemset.

    Support is the fraction of transactions in ``dataset`` that contain
    every element of ``item``.

    :param dataset: transactions, each a set of items
    :param item: the itemset whose support is wanted
    :return: a value in [0, 1]; 0.0 when ``dataset`` is empty
    """
    # An empty database supports nothing; avoid dividing by zero.
    if not dataset:
        return 0.0
    containing = sum(1 for transaction in dataset if item <= transaction)
    return containing / len(dataset)


def get_support_rate(dataset: List[Set[int]], data: List[Set[int]]) -> List[Tuple[tuple, float]]:
    """
    Compute the support of each given k-itemset.

    Returns one ``(itemset_as_tuple, support)`` pair per entry of ``data``,
    in the same order.
    """
    scored = []
    for candidate in data:
        as_tuple = tuple(candidate)
        support = get_one_support_rate(dataset, candidate)
        scored.append((as_tuple, support))
    return scored


def get_filter_items(data: List[Tuple[tuple, float]], rate: float) -> List[Set[int]]:
    """
    Keep the k-itemsets whose support reaches the threshold.

    ``data`` holds ``(itemset_tuple, support)`` pairs; every pair whose
    support is at least ``rate`` is returned as a set, order preserved.
    """
    frequent = []
    for entry in data:
        if entry[1] >= rate:
            frequent.append(set(entry[0]))
    return frequent

def get_item_confidence(items_after, transactions=None):
    """
    Build every association rule derivable from the given frequent itemsets.

    For each itemset, every non-empty proper subset becomes a rule
    antecedent and the remaining elements become the consequent.  The
    confidence of a rule is support(whole itemset) / support(antecedent).

    :param items_after: frequent itemsets (each a set of items)
    :param transactions: transaction database used to compute supports;
        when omitted, the module-level ``dataset`` is used, which keeps
        existing one-argument callers working
    :return: list of ``[antecedent, consequent, confidence]`` entries, in
        order of itemset, then antecedent size, then combination order
    """
    result = []
    for frequent_itemset in items_after:
        size = len(frequent_itemset)
        # Itemsets with fewer than two elements have no proper non-empty
        # subsets, hence no rules.
        if size < 2:
            continue
        if transactions is None:
            # Backward-compatible fallback to the module-level database.
            transactions = dataset

        whole = set(frequent_itemset)
        # Antecedent and consequent always union to the whole itemset, so
        # the numerator is the same for every rule of this itemset.
        whole_support = get_one_support_rate(transactions, whole)

        for antecedent_size in range(1, size):
            for combo in combinations(frequent_itemset, antecedent_size):
                antecedent = set(combo)
                consequent = whole - antecedent
                confidence = whole_support / get_one_support_rate(transactions, antecedent)
                result.append([antecedent, consequent, confidence])
    return result

def get_filter_relation(data: List[List[Union[Set[int], float]]], rate: float, is_log=True) -> List[
    List[Set[int]]]:
    """
    Keep the association rules whose confidence reaches the threshold.

    :param data: rules as ``[antecedent, consequent, confidence]`` entries
    :param rate: minimum confidence; rules with confidence >= ``rate`` are kept
    :param is_log: when true, print each kept rule as ``antecedent -> consequent``
    :return: the kept rules as ``[antecedent, consequent]`` pairs, order preserved
    """
    relations = [[rule[0], rule[1]] for rule in data if rule[2] >= rate]
    if is_log:
        for antecedent, consequent in relations:
            print(f"{antecedent} -> {consequent}")
    return relations
# ---- Driver: level-wise search for frequent itemsets and their rules ----
dataset = loadDataSet()
print(f"dataset: {dataset}")

MIN_SUPPORT = 0.5
MIN_CONFIDENCE = 0.8
print(f"最小支持度: {MIN_SUPPORT}")
print(f"最小置信度: {MIN_CONFIDENCE}")

k = 0
# Level 1 draws its candidate elements from the raw transactions; every
# later level draws them from the previous level's frequent itemsets.
items_after = dataset
while True:
    k = k + 1

    # Candidate k-itemsets from the elements that survived the last level.
    item_or_not = get_k_item(items_after, k)
    if not item_or_not:
        print("结束")
        break
    items = item_or_not
    print(f"候选{k}项集: {items}")

    # Score the candidates against the full transaction database.
    items_pre = get_support_rate(dataset, items)
    print(f"候选{k}项集的支持度: {items_pre}")

    # Stop as soon as no candidate reaches the minimum support.
    items_after_or_not = get_filter_items(items_pre, rate=MIN_SUPPORT)
    if not items_after_or_not:
        print("结束")
        break
    items_after = items_after_or_not
    print(f"频繁{k}项集: {items_after}")

    # Derive rules from this level's frequent itemsets and report those
    # that reach the minimum confidence.
    relations = get_item_confidence(items_after)
    print(f"关联规则及置信度: {relations}")
    print(f"筛选后的关联规则: ")
    get_filter_relation(relations, rate=MIN_CONFIDENCE, is_log=True)

output:

dataset: [{1, 2, 4, 5}, {2, 3, 5}, {1, 2, 3, 5}, {2, 5}]
最小支持度: 0.5
最小置信度: 0.8
候选1项集: [{1}, {2}, {3}, {4}, {5}]
候选1项集的支持度: [((1,), 0.5), ((2,), 1.0), ((3,), 0.5), ((4,), 0.25), ((5,), 1.0)]
频繁1项集: [{1}, {2}, {3}, {5}]
关联规则及置信度: []
筛选后的关联规则: 
候选2项集: [{1, 2}, {1, 3}, {1, 5}, {2, 3}, {2, 5}, {3, 5}]
候选2项集的支持度: [((1, 2), 0.5), ((1, 3), 0.25), ((1, 5), 0.5), ((2, 3), 0.5), ((2, 5), 1.0), ((3, 5), 0.5)]
频繁2项集: [{1, 2}, {1, 5}, {2, 3}, {2, 5}, {3, 5}]
关联规则及置信度: [[{1}, {2}, 1.0], [{2}, {1}, 0.5], [{1}, {5}, 1.0], [{5}, {1}, 0.5], [{2}, {3}, 0.5], [{3}, {2}, 1.0], [{2}, {5}, 1.0], [{5}, {2}, 1.0], [{3}, {5}, 1.0], [{5}, {3}, 0.5]]
筛选后的关联规则: 
{1} -> {2}
{1} -> {5}
{3} -> {2}
{2} -> {5}
{5} -> {2}
{3} -> {5}
候选3项集: [{1, 2, 3}, {1, 2, 5}, {1, 3, 5}, {2, 3, 5}]
候选3项集的支持度: [((1, 2, 3), 0.25), ((1, 2, 5), 0.5), ((1, 3, 5), 0.25), ((2, 3, 5), 0.5)]
频繁3项集: [{1, 2, 5}, {2, 3, 5}]
关联规则及置信度: [[{1}, {2, 5}, 1.0], [{2}, {1, 5}, 0.5], [{5}, {1, 2}, 0.5], [{1, 2}, {5}, 1.0], [{1, 5}, {2}, 1.0], [{2, 5}, {1}, 0.5], [{2}, {3, 5}, 0.5], [{3}, {2, 5}, 1.0], [{5}, {2, 3}, 0.5], [{2, 3}, {5}, 1.0], [{2, 5}, {3}, 0.5], [{3, 5}, {2}, 1.0]]
筛选后的关联规则: 
{1} -> {2, 5}
{1, 2} -> {5}
{1, 5} -> {2}
{3} -> {2, 5}
{2, 3} -> {5}
{3, 5} -> {2}
候选4项集: [{1, 2, 3, 5}]
候选4项集的支持度: [((1, 2, 3, 5), 0.25)]
结束
posted @ 2023-02-16 12:09  aminor  阅读(102)  评论(0)    收藏  举报
/**/ /**/