FP-growth演算法的python實現

FP-growth演算法是一種用於發現數據集中頻繁模式的有效方法。Apriori演算法在產生頻繁模式完全集前需要對資料庫進行多次掃描,同時產生大量的候選頻繁集,這就使Apriori演算法時間和空間複雜度較大。FP-growth演算法由Apriori演算法產生候選項集,然後掃描數據集來檢查它們是否頻繁。由於只對數據集掃描兩次,因此它比Apriori演算法速度要快,通常性能要好兩個數量級以上。

在FP-growth演算法中,數據集存儲在一個稱為FP(Frequent Pattern)樹的結構中。FP樹構建完成後,可以通過查找元素項的條件基以及構建條件FP樹來發現頻繁集。該過程不斷以更多的元素為條件重複進行,知道FP樹只包含一個元素為止。

下面僅以這個簡單的數據集為例子–實際上,既使在多達百萬條記錄的大數據集上,FP-growth演算法也能快速運行。

python代碼:

'''  FP-Growth FP means frequent pattern  the FP-Growth algorithm needs:  1. FP-tree (class treeNode)  2. header table (use dict)  This finds frequent itemsets similar to apriori but does not  find association rules.  @author: Peter  '''  def loadSimpDat():      simpDat = [['r', 'z', 'h', 'j', 'p'],                 ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],                 ['z','p','x'],                 ['r', 'x', 'n', 'o', 's'],                 ['y', 'r', 'x', 'z', 'q', 't', 'p'],                 ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]      return simpDat    class treeNode:      def __init__(self, nameValue, numOccur, parentNode):          self.name = nameValue          self.count = numOccur          self.nodeLink = None          self.parent = parentNode      #needs to be updated          self.children = {}        def inc(self, numOccur):          self.count += numOccur        def disp(self, ind=1):          print (('  '*ind, self.name, ' ', self.count))          for child in self.children.values():              child.disp(ind+1)      #def __lt__(self, other):#定義 "<"用於sorted()          #return self.count < other.count    def createTree(dataSet, minSup=1): #create FP-tree from dataset but don't mine      headerTable = {}      #go over dataSet twice      for trans in dataSet:#first pass counts frequency of occurance          for item in trans:              headerTable[item] = headerTable.get(item, 0) + dataSet[trans]        for k in list(headerTable.keys()):  #remove items not meeting minSup          if headerTable[k] < minSup:              headerTable.pop(k)      freqItemSet = set(headerTable.keys())        #print 'freqItemSet: ',freqItemSet      if len(freqItemSet) == 0: return None, None  #if no items meet min support -->get out      for k in headerTable:          headerTable[k] = [headerTable[k], None] #reformat headerTable to use Node link      #print 'headerTable: ',headerTable      retTree = treeNode('Null Set', 1, None) #create tree      for tranSet, count in dataSet.items():  #go through dataset 2nd time          localD = {}          for item in tranSet:  #put transaction items in order              if item in freqItemSet:                  localD[item] = headerTable[item][0]          if len(localD) > 0:              orderedItems = [v[0] for v in sorted(localD.items(), key=lambda p: p[1], reverse=True)]              updateTree(orderedItems, retTree, headerTable, count)#populate tree with ordered freq itemset      return retTree, headerTable #return tree and header table    def updateTree(items, inTree, headerTable, count):      if items[0] in inTree.children:#check if orderedItems[0] in retTree.children          inTree.children[items[0]].inc(count) #incrament count      else:   #add items[0] to inTree.children          inTree.children[items[0]] = treeNode(items[0], count, inTree)          if headerTable[items[0]][1] == None: #update header table              headerTable[items[0]][1] = inTree.children[items[0]]          else:              updateHeader(headerTable[items[0]][1], inTree.children[items[0]])      if len(items) > 1:#call updateTree() with remaining ordered items          updateTree(items[1::], inTree.children[items[0]], headerTable, count)    def updateHeader(nodeToTest, targetNode):   #this version does not use recursion      while (nodeToTest.nodeLink != None):    #Do not use recursion to traverse a linked list!          nodeToTest = nodeToTest.nodeLink      nodeToTest.nodeLink = targetNode    def ascendTree(leafNode, prefixPath): #ascends from leaf node to root      if leafNode.parent != None:          prefixPath.append(leafNode.name)          ascendTree(leafNode.parent, prefixPath)    def findPrefixPath(basePat, treeNode): #treeNode comes from header table      condPats = {}      while treeNode != None:          prefixPath = []          ascendTree(treeNode, prefixPath)          if len(prefixPath) > 1:              condPats[frozenset(prefixPath[1:])] = treeNode.count          treeNode = treeNode.nodeLink      return condPats    def mineTree(inTree, headerTable, minSup, preFix, freqItemList):      bigL = [k for k,v in sorted(headerTable.items(), key=lambda p: p[1][0])]#(sort header table)      for basePat in bigL:  #start from bottom of header table          newFreqSet = preFix.copy()          newFreqSet.add(basePat)          #print 'finalFrequent Item: ',newFreqSet    #append to set          freqItemList.append(newFreqSet)          condPattBases = findPrefixPath(basePat, headerTable[basePat][1])          #print 'condPattBases :',basePat, condPattBases          #2. construct cond FP-tree from cond. pattern base          myCondTree, myHead = createTree(condPattBases, minSup)          #print 'head from conditional tree: ', myHead          if myHead != None: #3. mine cond. FP-tree              #print 'conditional tree for: ',newFreqSet              #myCondTree.disp(1)              mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)#遞歸    def createInitSet(dataSet):      retDict = {}      for trans in dataSet:          retDict[frozenset(trans)] = 1      return retDict      minSup = 4  simpDat = loadSimpDat()  initSet = createInitSet(simpDat)  myFPtree, myHeaderTab = createTree(initSet, minSup)  myFreqList = []  if myFPtree is not None:      myFPtree.disp()      mineTree(myFPtree, myHeaderTab, minSup, set([]), myFreqList)  print("支持度為%d時,頻繁項數為%d:"%(minSup, len(myFreqList)))  print("頻繁項集為:")  for item in myFreqList:      print(item)