Source code for fraudar.export.greedy

#
# greedy.py
#
# Copyright (c) 2016-2017 Junpei Kawamoto
#
# This file is part of rgmining-fraudar.
#
# rgmining-fraudar is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# rgmining-fraudar is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with rgmining-fraudar. If not, see <http://www.gnu.org/licenses/>.
#
# This file was originally written by Bryan Hooi et al.,
# and distributed under the Apache License, Version 2.0.
#

# FRAUDAR: Bounding Graph Fraud in the Face of Camouflage
# Authors: Bryan Hooi, Hyun Ah Song, Alex Beutel, Neil Shah, Kijung Shin, Christos Faloutsos
#
# This software is licensed under Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Version: 1.0
# Date: Oct 3, 2016
# Main Contact: Bryan Hooi (bhooi@andrew.cmu.edu)
"""contains functions that run the greedy detector for dense regions in a sparse matrix.
use aveDegree or sqrtWeightedAveDegree or logWeightedAveDegree on a sparse matrix,
which returns ((rowSet, colSet), score) for the most suspicious block.
"""

from __future__ import division

import random
from logging import getLogger

import numpy as np
from scipy import sparse
from sklearn.utils import shuffle

from fraudar.export.MinTree import MinTree

logger = getLogger(__name__)

# threshold='nan' is rejected by modern NumPy; np.inf gives the intended
# "print everything" behavior.
np.set_printoptions(threshold=np.inf)
np.set_printoptions(linewidth=160)

# @profile
def listToSparseMatrix(edgesSource, edgesDest):
    """Given two edge lists, where the i-th edge runs from edgesSource[i]
    to edgesDest[i], return the 0/1 sparse matrix representation of the data.
    """
    m = max(edgesSource) + 1
    n = max(edgesDest) + 1
    M = sparse.coo_matrix(
        ([1] * len(edgesSource), (edgesSource, edgesDest)), shape=(m, n))
    M1 = M > 0
    return M1.astype('int')
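
# For example (a sketch), listToSparseMatrix([0, 0, 1], [1, 1, 2]) yields a
# 2x3 matrix with ones at (0, 1) and (1, 2); the duplicate (0, 1) edge is
# collapsed to a single 1 by the M > 0 step above.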


def shuffleMatrix(M):
    """Randomly shuffle the rows and columns.
    """
    M = shuffle(M)
    return shuffle(M.transpose()).transpose()

# @profile
def readData(filename):
    """Read a matrix from a file and return it as a sparse matrix.

    The first two columns of the file should be the row and column indices
    of the ones.
    """
    # dat = np.genfromtxt(filename, delimiter='\t', dtype=int)
    edgesSource = []
    edgesDest = []
    with open(filename) as f:
        for line in f:
            toks = line.split()
            edgesSource.append(int(toks[0]))
            edgesDest.append(int(toks[1]))
    return listToSparseMatrix(edgesSource, edgesDest)
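
# Example input (a sketch of the expected format): one edge per line, any
# whitespace as the separator, extra columns ignored, e.g.
#
#   0 3
#   1 3
#   2 5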


def detectMultiple(M, detectFunc, numToDetect):
    """Run detectFunc numToDetect times, zeroing out each detected block's
    edges before the next pass so that successive passes find new regions.
    """
    Mcur = M.copy().tolil()
    res = []
    for i in range(numToDetect):
        ((rowSet, colSet), score) = detectFunc(Mcur)
        res.append(((rowSet, colSet), score))
        (rs, cs) = Mcur.nonzero()
        # note: the inner loop used to shadow the outer loop variable i
        for k in range(len(rs)):
            if rs[k] in rowSet and cs[k] in colSet:
                Mcur[rs[k], cs[k]] = 0
    return res
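
# Usage sketch: detectMultiple(M, logWeightedAveDegree, 3) returns the three
# most suspicious blocks, each as ((rowSet, colSet), score), with repeats
# suppressed by the edge removal above.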


def injectCliqueCamo(M, m0, n0, p, testIdx):
    _, n = M.shape
    M2 = M.copy().tolil()

    colSum = np.squeeze(M2.sum(axis=0).A)
    colSumPart = colSum[n0:n]
    colSumPartPro = np.int_(colSumPart)
    colIdx = np.arange(n0, n, 1)
    population = np.repeat(colIdx, colSumPartPro, axis=0)

    for i in range(m0):
        # inject clique
        for j in range(n0):
            if random.random() < p:
                M2[i, j] = 1
        # inject camo
        if testIdx == 1:
            thres = p * n0 / (n - n0)
            for j in range(n0, n):
                if random.random() < thres:
                    M2[i, j] = 1
        if testIdx == 2:
            thres = 2 * p * n0 / (n - n0)
            for j in range(n0, n):
                if random.random() < thres:
                    M2[i, j] = 1
        # biased camo
        if testIdx == 3:
            colRplmt = random.sample(population, int(n0 * p))
            M2[i, colRplmt] = 1

    return M2.tocsc()
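
# Sketch of the modes above: injectCliqueCamo(M, 20, 20, 0.6, testIdx) plants
# a 20x20 block of density 0.6. testIdx == 1 gives each fraudulent row about
# as many camouflage edges as clique edges, testIdx == 2 doubles that amount,
# and testIdx == 3 biases the camouflage toward high-degree columns.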

# compute score as sum of 1- and 2- term interactions (currently just sum
# of matrix entries)
def c2Score(M, rowSet, colSet):
    return M[list(rowSet), :][:, list(colSet)].sum(axis=None)


def jaccard(pred, actual):
    intersectSize = len(set.intersection(pred[0], actual[0])) + \
        len(set.intersection(pred[1], actual[1]))
    unionSize = len(set.union(pred[0], actual[0])) + \
        len(set.union(pred[1], actual[1]))
    return intersectSize / unionSize


def getPrecision(pred, actual):
    intersectSize = len(set.intersection(pred[0], actual[0])) + \
        len(set.intersection(pred[1], actual[1]))
    return intersectSize / (len(pred[0]) + len(pred[1]))


def getRecall(pred, actual):
    intersectSize = len(set.intersection(pred[0], actual[0])) + \
        len(set.intersection(pred[1], actual[1]))
    return intersectSize / (len(actual[0]) + len(actual[1]))


def getFMeasure(pred, actual):
    prec = getPrecision(pred, actual)
    rec = getRecall(pred, actual)
    return 0 if (prec + rec == 0) else (2 * prec * rec / (prec + rec))
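
# Worked example (a sketch): with pred = ({0, 1}, {0}) and
# actual = ({1, 2}, {0}), the intersection has size 2 while pred and actual
# each have total size 3, so precision = recall = F-measure = 2/3.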


def getRowPrecision(pred, actual, idx):
    intersectSize = len(set.intersection(pred[idx], actual[idx]))
    return intersectSize / len(pred[idx])


def getRowRecall(pred, actual, idx):
    intersectSize = len(set.intersection(pred[idx], actual[idx]))
    return intersectSize / len(actual[idx])


def getRowFMeasure(pred, actual, idx):
    prec = getRowPrecision(pred, actual, idx)
    rec = getRowRecall(pred, actual, idx)
    return 0 if (prec + rec == 0) else (2 * prec * rec / (prec + rec))


def sqrtWeightedAveDegree(M):
    _, n = M.shape
    colSums = M.sum(axis=0)
    colWeights = 1.0 / np.sqrt(np.squeeze(colSums.A) + 5)
    colDiag = sparse.lil_matrix((n, n))
    colDiag.setdiag(colWeights)
    W = M * colDiag
    return fastGreedyDecreasing(W, colWeights)


def logWeightedAveDegree(M):
    (_, n) = M.shape
    colSums = M.sum(axis=0)
    colWeights = 1.0 / np.log(np.squeeze(colSums.A) + 5)
    colDiag = sparse.lil_matrix((n, n))
    colDiag.setdiag(colWeights)
    W = M * colDiag
    logger.info("finished computing weight matrix")
    return fastGreedyDecreasing(W, colWeights)
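
# Both weighted variants above discount columns with large sums (the "+ 5"
# keeps the weight finite even for empty columns), so edges into very
# high-degree columns contribute less to the density score.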


def aveDegree(M):
    _, n = M.shape
    return fastGreedyDecreasing(M, [1] * n)


def subsetAboveDegree(M, col_thres, row_thres):
    M = M.tocsc()
    (m, n) = M.shape
    colSums = np.squeeze(np.array(M.sum(axis=0)))
    rowSums = np.squeeze(np.array(M.sum(axis=1)))
    colValid = colSums > col_thres
    rowValid = rowSums > row_thres
    M1 = M[:, colValid].tocsr()
    M2 = M1[rowValid, :]
    rowFilter = [i for i in range(m) if rowValid[i]]
    colFilter = [i for i in range(n) if colValid[i]]
    return M2, rowFilter, colFilter
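
# Usage sketch: M2, rowFilter, colFilter = subsetAboveDegree(M, 5, 5) keeps
# only rows and columns whose degree exceeds 5; rowFilter[i] (resp.
# colFilter[j]) maps row i (column j) of M2 back to its index in M.
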
# @profile
def fastGreedyDecreasing(M, colWeights):
    """Greedily peel the row or column whose removal least decreases the
    total weight, tracking the (row, column) subset with the best average
    degree seen along the way.
    """
    (m, n) = M.shape
    Ml = M.tolil()
    Mlt = M.transpose().tolil()
    rowSet = set(range(0, m))
    colSet = set(range(0, n))
    curScore = c2Score(M, rowSet, colSet)
    bestAveScore = curScore / (len(rowSet) + len(colSet))
    logger.info("finished setting up greedy")
    # *decrease* in total weight when *removing* this row
    rowDeltas = np.squeeze(M.sum(axis=1).A)
    colDeltas = np.squeeze(M.sum(axis=0).A)
    logger.info("finished setting deltas")
    rowTree = MinTree(rowDeltas)
    colTree = MinTree(colDeltas)
    logger.info("finished building min trees")

    numDeleted = 0
    deleted = []
    bestNumDeleted = 0

    while rowSet and colSet:
        if (len(colSet) + len(rowSet)) % 100000 == 0:
            logger.info("current set size = %d", len(colSet) + len(rowSet))
        (nextRow, rowDelt) = rowTree.getMin()
        (nextCol, colDelt) = colTree.getMin()
        if rowDelt <= colDelt:
            curScore -= rowDelt
            for j in Ml.rows[nextRow]:
                colTree.changeVal(j, -colWeights[j])
            rowSet -= {nextRow}
            rowTree.changeVal(nextRow, float('inf'))
            deleted.append((0, nextRow))
        else:
            curScore -= colDelt
            for i in Mlt.rows[nextCol]:
                rowTree.changeVal(i, -colWeights[nextCol])
            colSet -= {nextCol}
            colTree.changeVal(nextCol, float('inf'))
            deleted.append((1, nextCol))
        numDeleted += 1
        curAveScore = curScore / (len(colSet) + len(rowSet))
        if curAveScore > bestAveScore:
            bestAveScore = curAveScore
            bestNumDeleted = numDeleted

    # reconstruct the best row and column sets
    finalRowSet = set(range(m))
    finalColSet = set(range(n))
    for i in range(bestNumDeleted):
        if deleted[i][0] == 0:
            finalRowSet.remove(deleted[i][1])
        else:
            finalColSet.remove(deleted[i][1])
    return ((finalRowSet, finalColSet), bestAveScore)
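

# A minimal runnable demo (a sketch, not part of the original module): plant
# a dense 10x10 block in a sparse random matrix and check that the greedy
# detector recovers it.
if __name__ == "__main__":
    demo = sparse.random(100, 100, density=0.02, format="lil")
    demo[:10, :10] = 1  # plant the dense block
    M = (demo.tocsc() > 0).astype('int')
    (rows, cols), score = logWeightedAveDegree(M)
    print("rows:", sorted(rows))
    print("cols:", sorted(cols))
    print("score:", score)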