迭代连通分量算法

Question

迭代连通分量算法

6

我有一个二分图，正在寻找将其划分为连通组件的最有效迭代方式。我的递归版本在大型数据集上开始溢出堆栈。我愿意从任何语言/伪代码移植，但为了完整起见，我将使用C#进行编码。

我的现有代码专门针对我的数据类型。其中一个分区是蛋白质，另一个是光谱。Map和Set是C++ stdlib的替代品。

void recursivelyAssignProteinToCluster (long proteinId,
                                        long clusterId,
                                        Set<long> spectrumSet,
                                        Map<long, Set<long>> spectrumSetByProteinId,
                                        Map<long, Set<long>> proteinSetBySpectrumId,
                                        Map<long, long> clusterByProteinId)
{
    // try to assign the protein to the current cluster
    var insertResult = clusterByProteinId.Insert(proteinId, clusterId);
    if (!insertResult.WasInserted)
        return;

    // recursively add all "cousin" proteins to the current cluster
    foreach (long spectrumId in spectrumSet)
        foreach (var cousinProteinId in proteinSetBySpectrumId[spectrumId])
        {
            if (proteinId != cousinProteinId)
            {
                Set<long> cousinSpectrumSet = spectrumSetByProteinId[cousinProteinId];
                recursivelyAssignProteinToCluster(cousinProteinId,
                                                  clusterId,
                                                  cousinSpectrumSet,
                                                  spectrumSetByProteinId,
                                                  proteinSetBySpectrumId,
                                                  clusterByProteinId);
            }
        }
}

Map<long, long> calculateProteinClusters (NHibernate.ISession session)
{
    var spectrumSetByProteinId = new Map<long, Set<long>>();
    var proteinSetBySpectrumId = new Map<long, Set<long>>();

    var query = session.CreateQuery("SELECT pi.Protein.id, psm.Spectrum.id " + GetFilteredQueryString(FromProtein, ProteinToPeptideSpectrumMatch));

    foreach (var queryRow in query.List<object[]>())
    {
        long proteinId = (long) queryRow[0];
        long spectrumId = (long) queryRow[1];

        spectrumSetByProteinId[proteinId].Add(spectrumId);
        proteinSetBySpectrumId[spectrumId].Add(proteinId);
    }

    var clusterByProteinId = new Map<long, long>();
    int clusterId = 0;

    foreach (var pair in spectrumSetByProteinId)
    {
        long proteinId = pair.Key;

        // for each protein without a cluster assignment, make a new cluster
        if (!clusterByProteinId.Contains(proteinId))
        {
            ++clusterId;

            recursivelyAssignProteinToCluster(proteinId,
                                              clusterId,
                                              pair.Value,
                                              spectrumSetByProteinId,
                                              proteinSetBySpectrumId,
                                              clusterByProteinId);
        }
    }

    return clusterByProteinId;
}

正如ShinTakezou建议的那样，我进行了重构，将堆栈放在堆上，它运行得很好。我使用了digEmAll示例中的DepthFirstSearch方法。

var clusterByProteinId = new Map<long, long>();
int clusterId = 0;
var clusterStack = new Stack<KeyValuePair<long, Set<long>>>();

foreach (var pair in spectrumSetByProteinId)
{
    long proteinId = pair.Key;

    if (clusterByProteinId.Contains(proteinId))
        continue;

    // for each protein without a cluster assignment, make a new cluster
    ++clusterId;
    clusterStack.Push(new KeyValuePair<long, Set<long>>(proteinId, spectrumSetByProteinId[proteinId]));
    while (clusterStack.Count > 0)
    {
        var kvp = clusterStack.Pop();

        // try to assign the protein to the current cluster
        var insertResult = clusterByProteinId.Insert(kvp.Key, clusterId);
        if (!insertResult.WasInserted)
            continue;

        // add all "cousin" proteins to the current cluster
        foreach (long spectrumId in kvp.Value)
            foreach (var cousinProteinId in proteinSetBySpectrumId[spectrumId])
                if (!clusterByProteinId.Contains(cousinProteinId))
                    clusterStack.Push(new KeyValuePair<long, Set<long>>(cousinProteinId, spectrumSetByProteinId[cousinProteinId]));
    }
}

- Matt Chambers

发布你的递归方法，有人在这里为你将其转化为迭代方法。 - Brannon

2

当递归消耗太多栈时，作为第一次尝试保持相同算法，您可以尝试将递归转换为从堆栈（对于函数的所需参数，在该堆栈上调用相同函数成为推送）中消耗数据的循环。当然，我指的是用户实现的LIFO列表，这需要一些工作，但是这样您就受到堆而不是栈的限制。（也许这仅适用于尾递归？我得想想...） - ShinTakezou

“Components”指的是子图吗？ - Beta

1个回答

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- digEmAll · Accepted Answer

下面是一个帮助类的示例，它保存一个无向图，并允许获取它的连通组件（迭代方式）:

public class Graph<T>
{
    public Dictionary<T, HashSet<T>> nodesNeighbors;
    public IEnumerable<T> Nodes
    {
        get { return nodesNeighbors.Keys; }
    }
    public Graph()
    {
        this.nodesNeighbors = new Dictionary<T, HashSet<T>>();
    }
    public void AddNode(T node)
    {
        this.nodesNeighbors.Add(node, new HashSet<T>());
    }
    public void AddNodes(IEnumerable<T> nodes)
    {
        foreach (var n in nodes)
            this.AddNode(n);
    }
    public void AddArc(T from, T to)
    {
        this.nodesNeighbors[from].Add(to);
        this.nodesNeighbors[to].Add(from);
    }
    public bool ContainsNode(T node)
    {
        return this.nodesNeighbors.ContainsKey(node);
    }
    public IEnumerable<T> GetNeighbors(T node)
    {
        return nodesNeighbors[node];
    }
    public IEnumerable<T> DepthFirstSearch(T nodeStart)
    {
        var stack = new Stack<T>();
        var visitedNodes = new HashSet<T>();
        stack.Push(nodeStart);
        while (stack.Count > 0)
        {
            var curr = stack.Pop();
            if (!visitedNodes.Contains(curr))
            {
                visitedNodes.Add(curr);
                yield return curr;
                foreach (var next in this.GetNeighbors(curr))
                {
                    if (!visitedNodes.Contains(next))
                        stack.Push(next);
                }
            }
        }
    }
    public Graph<T> GetSubGraph(IEnumerable<T> nodes)
    {
        Graph<T> g = new Graph<T>();
        g.AddNodes(nodes);
        foreach (var n in g.Nodes.ToList())
        {
            foreach (var neigh in this.GetNeighbors(n))
                g.AddArc(n, neigh);
        }
        return g;
    }

    public IEnumerable<Graph<T>> GetConnectedComponents()
    {
        var visitedNodes = new HashSet<T>();
        var components = new List<Graph<T>>();

        foreach (var node in this.Nodes)
        {
            if (!visitedNodes.Contains(node))
            {
                var subGraph = GetSubGraph(this.DepthFirstSearch(node));
                components.Add(subGraph);
                visitedNodes.UnionWith(subGraph.Nodes);
            }
        }
        return components;
    }
}

使用方法：

static void Main(string[] args)
{
    var g = new Graph<long>();
    g.AddNodes(new long[] { 1, 2, 3, 4, 5, 6, 7, 8, 9 });
    g.AddArc(1, 2);
    g.AddArc(1, 3);

    g.AddArc(9, 6);
    g.AddArc(6, 7);
    g.AddArc(6, 8);

    g.AddArc(4, 5);

    var subGraphs = g.GetConnectedComponents();

}

您可以使用Graph<>类替代您的映射，或者如果您想继续使用映射，请查看代码，这很容易理解（在类内部使用了Dictionary<T,HashSet<T>>来保存节点和弧，因此与您的方法非常相似）。