Source code for fraudar.export.greedy

#
#  greedy.py
#
#  Copyright (c) 2016-2023 Junpei Kawamoto
#
#  This file is part of rgmining-fraudar.
#
#  rgmining-fraudar is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  rgmining-fraudar is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with rgmining-fraudar. If not, see <http://www.gnu.org/licenses/>.
#
#  This file was originally made by Bryan Hooi et al,
#  and distributed under Apache License, Version 2.0.
#
"""
This module contains functions that run the greedy detector for dense regions in a sparse matrix.
use aveDegree or sqrtWeightedAveDegree or logWeightedAveDegree on a sparse matrix,
which returns ((rowSet, colSet), score) for the most suspicious block.
"""
import random

import numpy as np
from scipy import sparse

from .MinTree import MinTree

np.set_printoptions(linewidth=160)


# given 2 lists corresponding to the edge source and destination,
# this returns the sparse matrix representation of the data
# @profile
def listToSparseMatrix(edgesSource, edgesDest):
    m = max(edgesSource) + 1
    n = max(edgesDest) + 1
    M = sparse.coo_matrix(([1] * len(edgesSource), (edgesSource, edgesDest)), shape=(m, n))
    M1 = M > 0
    return M1.astype("int")
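
# For example (hypothetical toy data), edges (0,0), (0,1), (1,1), (1,1)
# become a 2x2 matrix; the duplicate edge collapses to 1 because of the
# M > 0 step above:
#
#   M = listToSparseMatrix([0, 0, 1, 1], [0, 1, 1, 1])
#   M.toarray()  # -> [[1, 1], [0, 1]]
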
# reads a matrix from a file and returns a sparse matrix. The first two
# columns should be the row and column indices of the ones.
# @profile
def readData(filename):
    # dat = np.genfromtxt(filename, delimiter='\t', dtype=int)
    edgesSource = []
    edgesDest = []
    with open(filename) as f:
        for line in f:
            toks = line.split()
            edgesSource.append(int(toks[0]))
            edgesDest.append(int(toks[1]))
    return listToSparseMatrix(edgesSource, edgesDest)
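
# A compatible input file (hypothetical example) is whitespace-separated,
# one edge per line, e.g.:
#
#   0 0
#   0 1
#   1 1
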
# runs detectFunc numToDetect times, zeroing out the edges of each detected
# block before the next pass, and returns the list of detected blocks
def detectMultiple(M, detectFunc, numToDetect):
    Mcur = M.copy().tolil()
    res = []
    for i in range(numToDetect):
        ((rowSet, colSet), score) = detectFunc(Mcur)
        res.append(((rowSet, colSet), score))
        # zero out the edges inside the detected block
        (rs, cs) = Mcur.nonzero()
        for k in range(len(rs)):
            if rs[k] in rowSet and cs[k] in colSet:
                Mcur[rs[k], cs[k]] = 0
    return res
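
# Sketch of peeling off several blocks (hypothetical call, assuming M is a
# sparse matrix built as above):
#
#   blocks = detectMultiple(M, aveDegree, 3)
#   for (rows, cols), score in blocks:
#       print(len(rows), len(cols), score)
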
# inject a clique of size m0 by n0 with density p. The last parameter,
# testIdx, determines the camouflage type:
#   testIdx = 1: random camouflage, with the camouflage density set so each
#                fraudster outputs approximately equal numbers of fraudulent
#                and camouflage edges
#   testIdx = 2: random camouflage, with double the density of the previous setting
#   testIdx = 3: biased camouflage, more likely to add camouflage to high-degree columns
def injectCliqueCamo(M, m0, n0, p, testIdx):
    (m, n) = M.shape
    M2 = M.copy().tolil()

    colSum = np.squeeze(M2.sum(axis=0).A)
    colSumPart = colSum[n0:n]
    colSumPartPro = np.int_(colSumPart)
    colIdx = np.arange(n0, n, 1)
    population = np.repeat(colIdx, colSumPartPro, axis=0)

    for i in range(m0):
        # inject clique
        for j in range(n0):
            if random.random() < p:
                M2[i, j] = 1
        # inject camo
        if testIdx == 1:
            thres = p * n0 / (n - n0)
            for j in range(n0, n):
                if random.random() < thres:
                    M2[i, j] = 1
        if testIdx == 2:
            thres = 2 * p * n0 / (n - n0)
            for j in range(n0, n):
                if random.random() < thres:
                    M2[i, j] = 1
        # biased camo: random.sample needs a real sequence, so convert the
        # NumPy array to a list before sampling
        if testIdx == 3:
            colRplmt = random.sample(list(population), int(n0 * p))
            M2[i, colRplmt] = 1

    return M2.tocsc()
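
# Sketch (made-up sizes): plant a 50x50 block of density 0.8 with random
# camouflage, then check whether the detector recovers it.
#
#   M2 = injectCliqueCamo(M, 50, 50, 0.8, 1)
#   ((rowSet, colSet), score) = logWeightedAveDegree(M2)
#   print(getFMeasure((rowSet, colSet), (set(range(50)), set(range(50)))))
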
# sum of weighted edges in rowSet and colSet, plus node suspiciousness values, in matrix M
def c2Score(M, rowSet, colSet, nodeSusp):
    suspTotal = nodeSusp[0][list(rowSet)].sum() + nodeSusp[1][list(colSet)].sum()
    return M[list(rowSet), :][:, list(colSet)].sum(axis=None) + suspTotal
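
# Worked toy example (hypothetical numbers): for a 2x2 all-ones matrix with
# row set {0, 1}, column set {0}, and zero suspiciousness, the score is the
# two selected edges: 1 + 1 = 2.
#
#   M = sparse.csr_matrix([[1, 1], [1, 1]])
#   c2Score(M, {0, 1}, {0}, (np.zeros(2), np.zeros(2)))  # -> 2.0
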
# evaluation metrics: pred and actual are (rowSet, colSet) pairs. The metrics
# below are computed over rows and columns combined; the *Row* variants use
# only the side selected by idx (0 for rows, 1 for columns).
def jaccard(pred, actual):
    intersectSize = len(set.intersection(pred[0], actual[0])) + len(set.intersection(pred[1], actual[1]))
    unionSize = len(set.union(pred[0], actual[0])) + len(set.union(pred[1], actual[1]))
    return intersectSize / unionSize

def getPrecision(pred, actual):
    intersectSize = len(set.intersection(pred[0], actual[0])) + len(set.intersection(pred[1], actual[1]))
    return intersectSize / (len(pred[0]) + len(pred[1]))

def getRecall(pred, actual):
    intersectSize = len(set.intersection(pred[0], actual[0])) + len(set.intersection(pred[1], actual[1]))
    return intersectSize / (len(actual[0]) + len(actual[1]))

def getFMeasure(pred, actual):
    prec = getPrecision(pred, actual)
    rec = getRecall(pred, actual)
    return 0 if (prec + rec == 0) else (2 * prec * rec / (prec + rec))

def getRowPrecision(pred, actual, idx):
    intersectSize = len(set.intersection(pred[idx], actual[idx]))
    return intersectSize / len(pred[idx])

def getRowRecall(pred, actual, idx):
    intersectSize = len(set.intersection(pred[idx], actual[idx]))
    return intersectSize / len(actual[idx])

def getRowFMeasure(pred, actual, idx):
    prec = getRowPrecision(pred, actual, idx)
    rec = getRowRecall(pred, actual, idx)
    return 0 if (prec + rec == 0) else (2 * prec * rec / (prec + rec))
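
# Toy example of the metrics (hypothetical sets): predicted rows {0, 1} and
# columns {0} against actual rows {1, 2} and columns {0} give an intersection
# of size 2 and a union of size 4, so:
#
#   pred = ({0, 1}, {0})
#   actual = ({1, 2}, {0})
#   jaccard(pred, actual)       # -> 0.5
#   getPrecision(pred, actual)  # -> 2/3
#   getRecall(pred, actual)     # -> 2/3
#   getFMeasure(pred, actual)   # -> 2/3
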
# run greedy algorithm using square root column weights
def sqrtWeightedAveDegree(M, nodeSusp=None):
    (m, n) = M.shape
    colSums = M.sum(axis=0)
    # flatten to a 1-D array (as in logWeightedAveDegree) so that
    # colWeights[j] indexing works inside fastGreedyDecreasing
    colWeights = np.squeeze(np.array(1.0 / np.sqrt(np.squeeze(colSums) + 5)))
    colDiag = sparse.lil_matrix((n, n))
    colDiag.setdiag(colWeights)
    W = M * colDiag
    return fastGreedyDecreasing(W, colWeights, nodeSusp)

# run greedy algorithm using logarithmic weights
def logWeightedAveDegree(M, nodeSusp=None):
    (m, n) = M.shape
    colSums = M.sum(axis=0)
    colWeights = np.squeeze(np.array(1.0 / np.log(np.squeeze(colSums) + 5)))
    colDiag = sparse.lil_matrix((n, n))
    colDiag.setdiag(colWeights)
    W = M * colDiag
    print("finished computing weight matrix")
    return fastGreedyDecreasing(W, colWeights, nodeSusp)
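
# Illustrative numbers: under the log weighting above, a column with sum 0
# gets weight 1 / log(5) ~ 0.62 while a column with sum 995 gets
# 1 / log(1000) ~ 0.14, so edges into very popular columns count for less.
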
# run greedy algorithm with uniform (unweighted) column weights
def aveDegree(M, nodeSusp=None):
    (m, n) = M.shape
    return fastGreedyDecreasing(M, [1] * n, nodeSusp)

# keeps only the rows and columns of M whose degree exceeds the given
# thresholds; returns the submatrix together with the lists mapping its
# row/column indices back to the original ones
def subsetAboveDegree(M, col_thres, row_thres):
    M = M.tocsc()
    (m, n) = M.shape
    colSums = np.squeeze(np.array(M.sum(axis=0)))
    rowSums = np.squeeze(np.array(M.sum(axis=1)))
    colValid = colSums > col_thres
    rowValid = rowSums > row_thres
    M1 = M[:, colValid].tocsr()
    M2 = M1[rowValid, :]
    rowFilter = [i for i in range(m) if rowValid[i]]
    colFilter = [i for i in range(n) if colValid[i]]
    return M2, rowFilter, colFilter
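
# Sketch (hypothetical thresholds): prune low-degree rows/columns before
# detection, then map the detected indices back through the returned filters.
#
#   Msub, rowFilter, colFilter = subsetAboveDegree(M, 10, 10)
#   ((rows, cols), score) = aveDegree(Msub)
#   origRows = {rowFilter[i] for i in rows}
#   origCols = {colFilter[j] for j in cols}
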
# @profile
def fastGreedyDecreasing(M, colWeights, nodeSusp=None):
    (m, n) = M.shape
    if nodeSusp is None:
        nodeSusp = (np.zeros(m), np.zeros(n))
    Ml = M.tolil()
    Mlt = M.transpose().tolil()
    rowSet = set(range(0, m))
    colSet = set(range(0, n))
    curScore = c2Score(M, rowSet, colSet, nodeSusp)
    bestAveScore = curScore / (len(rowSet) + len(colSet))
    print("finished initialization")
    # contribution of each row to the total weight, i.e. the *decrease* in
    # total weight when that row is *removed* (and likewise for columns)
    rowDeltas = np.squeeze(M.sum(axis=1).A) + nodeSusp[0]
    colDeltas = np.squeeze(M.sum(axis=0).A) + nodeSusp[1]
    print("finished setting deltas")
    rowTree = MinTree(rowDeltas)
    colTree = MinTree(colDeltas)
    print("finished building min trees")

    numDeleted = 0
    deleted = []
    bestNumDeleted = 0

    while rowSet and colSet:
        if (len(colSet) + len(rowSet)) % 100000 == 0:
            print("current set size = %d" % (len(colSet) + len(rowSet),))
        (nextRow, rowDelt) = rowTree.getMin()
        (nextCol, colDelt) = colTree.getMin()
        if rowDelt <= colDelt:
            # remove the row whose deletion loses the least weight
            curScore -= rowDelt
            for j in Ml.rows[nextRow]:
                colTree.changeVal(j, -colWeights[j])
            rowSet -= {nextRow}
            rowTree.changeVal(nextRow, float("inf"))
            deleted.append((0, nextRow))
        else:
            # remove the column whose deletion loses the least weight
            curScore -= colDelt
            for i in Mlt.rows[nextCol]:
                rowTree.changeVal(i, -colWeights[nextCol])
            colSet -= {nextCol}
            colTree.changeVal(nextCol, float("inf"))
            deleted.append((1, nextCol))
        numDeleted += 1
        curAveScore = curScore / (len(colSet) + len(rowSet))
        if curAveScore > bestAveScore:
            bestAveScore = curAveScore
            bestNumDeleted = numDeleted

    # reconstruct the best row and column sets
    finalRowSet = set(range(m))
    finalColSet = set(range(n))
    for i in range(bestNumDeleted):
        if deleted[i][0] == 0:
            finalRowSet.remove(deleted[i][1])
        else:
            finalColSet.remove(deleted[i][1])
    return (finalRowSet, finalColSet), bestAveScore
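
# Direct-call sketch (hypothetical): uniform weights reproduce aveDegree;
# any custom colWeights must be indexable by column, with one weight per
# column of M.
#
#   ((rows, cols), bestAvg) = fastGreedyDecreasing(M, np.ones(M.shape[1]))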