doclist 阅读(56) 评论(0)

对apriori关联关系算法研究了一段时间,网上能搜到的例子,大部分是python写的,数据集长得像下面这样:

[[I1,I2,I5],[I2,I4],[I2,I3],[I1,I2,I4],[I1,I3],[I2,I3],[I1,I3],[I1,I2,I3,I5],[I1,I2,I3]]

而实际的交易数据,一般存储到关系型数据库中,数据是按下面的样子保存:

TRAN_SEQ_NO,ITEM
1,I1
1,I2
1,I5
2,I2
2,I4
3,I2
3,I3
4,I1
4,I2
4,I4
5,I1
5,I3
6,I2
6,I3
7,I1
7,I3
8,I1
8,I2
8,I3
8,I5
9,I1
9,I2
9,I3

而且python的程序,写了好多循环,效率不高。

根据小票数据在数据库中存储的特点,并且apriori算法也不是特别复杂,因此想用plsql实现一下。

plsql实现的aprioir算法,对原算法做了裁剪,只计算2项集和两个商品之间的关联关系,3项集以上的忽略不计。

表结构创建

1.小票表(交易事物表)

create table CMX_APRIORI_TRANSACTION
(
  tran_seq_no NUMBER(20), --交易号
  item        VARCHAR2(25)  --商品编码
)

2.频繁项集C1

create table CMX_APRIORI_L1
(
  item    VARCHAR2(25),  --商品编码
  support NUMBER,          --支持度
  cnt     NUMBER              --交易次数
)

2.关联关系L2(结果表)

create table CMX_APRIORI_L2
(
  item_a   VARCHAR2(25),    --前件商品编码
  item_b   VARCHAR2(25),    --后件商品编码
  cnt      NUMBER,                 --交易次数
  support  NUMBER,             --支持度
  conf_a_b NUMBER(12,6),  --置信度
  lift_a_b NUMBER(12,6)      --提升度
)

 说明:

小票表插入的数据,必须提前进行处理。同一张小票商品要去重。

完整plsql代码:

CREATE OR REPLACE PACKAGE CMX_APRIORI_SQL IS

  /*-----------------------------------------------------------------------
  * PROCEDURE NAME      : CMX_APRIORI_SQL
  * COMMENTS            : 商品关联关系计算
  * CODED BY            : ONELANG 2019-1-27
  * CHANGED HISTORY     :
  -----------------------------------------------------------------------*/
  FUNCTION TEST(O_ERROR_MESSAGE IN OUT VARCHAR2)   RETURN BOOLEAN;
END CMX_APRIORI_SQL;
/
CREATE OR REPLACE PACKAGE BODY CMX_APRIORI_SQL IS

 /*-----------------------------------------------------------------------
  * PROCEDURE NAME      : INIT_TEST_DATA
  * COMMENTS            : 初始化测试数据
  * CODED BY            : ONELANG 2019-1-27
  * CHANGED HISTORY     :
  -----------------------------------------------------------------------*/
  
  FUNCTION INIT_TEST_DATA(O_ERROR_MESSAGE IN OUT VARCHAR2) RETURN BOOLEAN IS
    L_PROGRAM VARCHAR2(100) := 'CMX_APRIORI_SQL.INIT_TEST_DATA';
  BEGIN
  
    EXECUTE IMMEDIATE 'TRUNCATE TABLE CMX_APRIORI_TRANSACTION';
    
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (1,'I1');
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (1,'I2');
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (1,'I5');
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (2,'I2');
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (2,'I4');
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (3,'I2');
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (3,'I3');
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (4,'I1');
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (4,'I2');
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (4,'I4');
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (5,'I1');
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (5,'I3');
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (6,'I2');
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (6,'I3');
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (7,'I1');
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (7,'I3');
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (8,'I1');
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (8,'I2');
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (8,'I3');
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (8,'I5');
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (9,'I1');
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (9,'I2');
    INSERT INTO CMX_APRIORI_TRANSACTION (TRAN_SEQ_NO,ITEM)
    VALUES (9,'I3');
    COMMIT;
    RETURN TRUE;
  
  EXCEPTION
    WHEN OTHERS THEN
      O_ERROR_MESSAGE := SQLERRM||TO_CHAR(SQLCODE);
      RETURN FALSE;
  END;

 /*-----------------------------------------------------------------------
  * PROCEDURE NAME      : GET_TEST_L1
  * COMMENTS            : 计算频繁项集L1
  * CODED BY            : ONELANG 2019-1-27
  * CHANGED HISTORY     :
  -----------------------------------------------------------------------*/
  
FUNCTION GET_TEST_L1(O_ERROR_MESSAGE IN OUT VARCHAR2,I_SUPPORT IN NUMBER) RETURN BOOLEAN IS
    L_PROGRAM VARCHAR2(100) := 'CMX_APRIORI_SQL.GET_TEST_L1';
    L_TOTAL NUMBER(8);
  BEGIN
    
    EXECUTE IMMEDIATE 'TRUNCATE TABLE CMX_APRIORI_L1';
    
    SELECT COUNT(DISTINCT TRAN_SEQ_NO) INTO L_TOTAL
    FROM CMX_APRIORI_TRANSACTION ;
             
    INSERT INTO CMX_APRIORI_L1
    SELECT ITEM,ROUND(COUNT(1) / L_TOTAL,6) SUPPORT,COUNT(1) CNT
      FROM CMX_APRIORI_TRANSACTION A
    GROUP BY ITEM
    HAVING COUNT(1) / L_TOTAL >= I_SUPPORT ;
    
    COMMIT;
      
    RETURN TRUE;
  
  EXCEPTION
    WHEN OTHERS THEN
      O_ERROR_MESSAGE := SQLERRM||TO_CHAR(SQLCODE);
    
      RETURN FALSE;
  END;
  
  /*-----------------------------------------------------------------------
  * PROCEDURE NAME      : GET_L2
  * COMMENTS            : 获取关联关系L2
  * CODED BY            : 
  * CHANGED HISTORY     :
  -----------------------------------------------------------------------*/
  FUNCTION GET_TEST_L2(O_ERROR_MESSAGE IN OUT VARCHAR2,I_SUPPORT IN NUMBER) RETURN BOOLEAN IS
    L_PROGRAM VARCHAR2(100) := 'CMX_APRIORI_SQL.GET_L2';
    L_TOTAL NUMBER(8);
  BEGIN
    
    EXECUTE IMMEDIATE 'TRUNCATE TABLE CMX_APRIORI_L2';
    
    SELECT COUNT(DISTINCT TRAN_SEQ_NO) INTO L_TOTAL
    FROM CMX_APRIORI_TRANSACTION ;
    
    --2项集支持度
    INSERT INTO CMX_APRIORI_L2(ITEM_A,ITEM_B,CNT,SUPPORT)
    SELECT ITEM_A,ITEM_B,COUNT(DISTINCT TRAN_SEQ_NO) CNT, ROUND(COUNT(DISTINCT TRAN_SEQ_NO) / L_TOTAL,6) SUPPORT
      FROM (SELECT A.TRAN_SEQ_NO,
                   A.ITEM ITEM_A,
                   B.ITEM ITEM_B
            FROM CMX_APRIORI_TRANSACTION A,
                 CMX_APRIORI_TRANSACTION B
           WHERE A.ITEM IN (SELECT ITEM FROM CMX_APRIORI_L1)
             AND B.ITEM IN (SELECT ITEM FROM CMX_APRIORI_L1)
             AND A.TRAN_SEQ_NO = B.TRAN_SEQ_NO
             AND A.ITEM > B.ITEM             
          )
    GROUP BY ITEM_A,ITEM_B
    HAVING  COUNT(DISTINCT TRAN_SEQ_NO) / L_TOTAL >= I_SUPPORT;
    
    --置信度
    UPDATE CMX_APRIORI_L2 L2
       SET CONF_A_B = (SELECT L2.SUPPORT /  L1.SUPPORT FROM CMX_APRIORI_L1 L1 WHERE L2.ITEM_A = L1.ITEM);
    
    --提升度
    UPDATE CMX_APRIORI_L2 L2
       SET LIFT_A_B = (SELECT L2.CONF_A_B /  L1.SUPPORT FROM CMX_APRIORI_L1 L1 WHERE L2.ITEM_B = L1.ITEM);
    
    COMMIT;
      
    RETURN TRUE;
  
  EXCEPTION
    WHEN OTHERS THEN
      O_ERROR_MESSAGE := SQLERRM||TO_CHAR(SQLCODE);
    
      RETURN FALSE;
  END;  
  
  FUNCTION TEST(O_ERROR_MESSAGE IN OUT VARCHAR2) RETURN BOOLEAN IS
    L_PROGRAM VARCHAR2(100) := 'CMX_APRIORI_SQL.TEST';
    L_MIN_SUPP NUMBER(9,5);
  BEGIN
    
    L_MIN_SUPP := 0.1;
    
    IF INIT_TEST_DATA(O_ERROR_MESSAGE) = FALSE THEN
      RETURN FALSE;
    END IF;
    
    IF GET_TEST_L1(O_ERROR_MESSAGE,L_MIN_SUPP) = FALSE THEN
      RETURN FALSE;
    END IF;
    
    IF GET_TEST_L2(O_ERROR_MESSAGE,L_MIN_SUPP) = FALSE THEN
      RETURN FALSE;
    END IF;
  
    RETURN TRUE;
  
  EXCEPTION
    WHEN OTHERS THEN
      O_ERROR_MESSAGE := SQLERRM||TO_CHAR(SQLCODE);
      RETURN FALSE;
  END;
  
END CMX_APRIORI_SQL;
/

运行:

 

declare  
  result boolean;
begin
  -- Call the function
  result := cmx_apriori_sql.test(o_error_message => :o_error_message);
  -- Convert false/true/null to 0/1/null 
  :result := sys.diutil.bool_to_int(result);
end;

运行结果:

select * from cmx_apriori_l2

 ITEM_A  ITEM_B    CNT    SUPPORT    CONF_A_B    LIFT_A_B
1    I2    I1    4    0.444444    0.571428    0.857142
2    I4    I2    2    0.222222    1.000000    1.285714
3    I5    I3    1    0.111111    0.500000    0.750000
4    I3    I2    4    0.444444    0.666666    0.857142
5    I5    I1    2    0.222222    1.000000    1.499999
6    I3    I1    4    0.444444    0.666666    0.999999
7    I4    I1    1    0.111111    0.500000    0.750000
8    I5    I2    2    0.222222    1.000000    1.285714

 

用一家门店,一年的销售数据计算一下就会发现,尿布->啤酒根本没有关联关系。意外+惊喜。