如何高效地维护传递闭包表?

14
我在我的关系型数据库(Firebird)中有一个DAG,其中包含两个表edge和node(邻接列表模型)。我想递归查询它们,但发现递归查询非常低效。因此,我尝试实现触发器来维护传递闭包,遵循Dong等人的论文http://homepages.inf.ed.ac.uk/libkin/papers/tc-sql.pdf
现在,SELECT非常快,但DELETE非常慢,因为几乎整个图形都会被复制以进行单个删除。更糟糕的是,并发更新似乎不可能。
有没有更好的方法来实现这个?
编辑
我进行了一些实验,并向TC表引入了引用计数器。有了它,删除就很快了。我编写了一些简单的测试用例,但我不确定我是否做得对。这是我目前的进展:
CREATE GENERATOR graph_tc_seq;

CREATE TABLE EDGE (
    parent DECIMAL(10, 0) NOT NULL,
    child DECIMAL(10, 0) NOT NULL,
    PRIMARY KEY (parent, child)
);

CREATE TABLE GRAPH_TC (
    parent DECIMAL(10, 0) NOT NULL,
    child DECIMAL(10, 0) NOT NULL,
    refcount DECIMAL(9, 0),
    PRIMARY KEY (parent, child)
);

CREATE TABLE GRAPH_TC_TEMP (
    session_id DECIMAL(9, 0),
    parent DECIMAL(10, 0),
    child DECIMAL(10, 0)
);

CREATE PROCEDURE GRAPH_TC_CREATE (p_parent DECIMAL(10, 0), c_child DECIMAL(10, 0))
AS 
    declare variable tp_parent DECIMAL(10,0);
    declare variable tc_child DECIMAL(10,0);
    declare variable session_id DECIMAL(9,0);
    declare variable refs DECIMAL(9,0);
begin
    session_id = gen_id(graph_tc_seq,1);
    insert into graph_tc_temp (parent, child, session_id, refcount) values (:p_parent, :p_parent, :session_id, 1);
    insert into graph_tc_temp (parent, child, session_id, refcount) values (:c_child, :c_child, :session_id, 1);
    insert into graph_tc_temp (parent, child, session_id, refcount) values (:p_parent, :c_child, :session_id, 1);
    insert into graph_tc_temp (parent, child, session_id, refcount) select distinct :p_parent, child, :session_id, refcount from graph_tc where parent = :c_child and not parent = child;
    insert into graph_tc_temp (child, parent, session_id, refcount) select distinct :c_child, parent, :session_id, refcount from graph_tc where child = :p_parent and not parent = child;
    insert into graph_tc_temp (parent, child, session_id, refcount) select distinct a.parent, b.child, :session_id, a.refcount*b.refcount from graph_tc a, graph_tc b where a.child = :p_parent and b.parent = :c_child and not a.parent = a.child and not b.parent = b.child;
    for select parent, child, refcount from graph_tc_temp e where session_id= :session_id and exists (select * from graph_tc t where t.parent = e.parent and t.child = e.child ) into :tp_parent, :tc_child, :refs do begin
        update graph_tc set refcount=refcount+ :refs where parent = :tp_parent and child = :tc_child;
    end
    insert into graph_tc (parent, child, refcount) select parent, child, refcount from graph_tc_temp e where session_id = :session_id and not exists (select * from graph_tc t where t.parent = e.parent and t.child = e.child);
    delete from graph_tc_temp where session_id = :session_id;
end ^

CREATE PROCEDURE GRAPH_TC_DELETE (p_parent DECIMAL(10, 0), c_child DECIMAL(10, 0))
AS 
    declare variable tp_parent DECIMAL(10,0);
    declare variable tc_child DECIMAL(10,0);
    declare variable refs DECIMAL(9,0);
begin
    delete from graph_tc where parent = :p_parent and child = :p_parent and refcount <= 1;
    update graph_tc set refcount = refcount - 1 where parent = :p_parent and child = :p_parent and refcount > 1;
    delete from graph_tc where parent = :c_child and child = :c_child and refcount <= 1;
    update graph_tc set refcount = refcount - 1 where parent = :c_child and child = :c_child and refcount > 1;
    delete from graph_tc where parent = :p_parent and child = :c_child and refcount <= 1;
    update graph_tc set refcount = refcount - 1 where parent = :p_parent and child = :c_child and refcount > 1;
    for select distinct :p_parent,  b.child, refcount from graph_tc b where b.parent = :c_child and not b.parent = b.child into :tp_parent, :tc_child, :refs do begin
        delete from graph_tc where parent = :tp_parent and child = :tc_child and refcount <= :refs;
        update graph_tc set refcount = refcount - :refs where parent = :tp_parent and child = :tc_child and refcount > :refs;
    end
    for select distinct :c_child,  b.parent, refcount from graph_tc b where b.child = :p_parent and not b.parent = b.child into :tc_child, :tp_parent, :refs do begin
        delete from graph_tc where child = :tc_child and parent = :tp_parent and refcount <= :refs;
        update graph_tc set refcount = refcount - :refs where child = :tc_child and parent = :tp_parent and refcount > :refs;
    end
    for select distinct a.parent, b.child, a.refcount*b.refcount from graph_tc a, graph_tc b where not a.parent = a.child and not b.parent = b.child and a.child = :p_parent and b.parent = :c_child into :tp_parent, :tc_child, :refs do begin
        delete from graph_tc where parent = :tp_parent and child = :tc_child and refcount <= :refs;
        update graph_tc set refcount = refcount - :refs where parent = :tp_parent and child = :tc_child and refcount > :refs;
    end
end ^

CREATE TRIGGER GRAPH_TC_AFTER_INSERT FOR EDGE AFTER INSERT as
begin
    execute procedure graph_tc_create(new.parent,new.child);
end ^

CREATE TRIGGER GRAPH_TC_AFTER_UPDATE FOR EDGE AFTER UPDATE as
begin
    if ((new.parent <> old.parent) or (new.child <> old.child)) then begin
    execute procedure graph_tc_delete(old.parent,old.child);
    execute procedure graph_tc_create(new.parent,new.child);
    end
end ^

CREATE TRIGGER GRAPH_TC_AFTER_DELETE FOR EDGE AFTER DELETE as
begin
    execute procedure graph_tc_delete(old.parent,old.child);
end ^

这是我的想法,但我认为其他人已经实现了一个TC。他们在做同样的事情吗?
我有一些测试用例,但我不确定在更大的图形中是否可能出现不一致。
并发方面怎么样?当两个同时事务想要更新图形时,我认为这种方法会失败,对吗?
编辑
我在我的代码中发现了一些错误,并希望与您分享修复后的版本。
我找到了一篇很好的文章:http://www.codeproject.com/Articles/22824/A-Model-to-Represent-Directed-Acyclic-Graphs-DAG-o。还有更多有趣的文章或科学论文,采用不同的方法吗?

你能否包含(相关的部分)DDL和触发器定义? - Mark Rotteveel
2
考虑使用GTT(全局临时表)代替普通表格来存储GRAPH_TC_TEMP - Mark Rotteveel
您想递归查询表格,还是查询特定的图和连接? - ChuckCottrill
@ChuckCottrill 我想递归查询所有连接。例如:节点X的所有(间接)父节点或子节点是什么?哪些节点在节点X和Y之间? - Christoph Walesch
@ChristophWalesch,你提到发现了一些错误并想分享修复版本。你的修复版本在哪里? - sryscad
@sryscad,您看到的是修复后的版本。早期版本存在我提到的错误。 - Christoph Walesch
3个回答

1

2
我知道图形数据库是理想的解决方案;但对于每个小于100k边缘的两个图形,我不会添加新的数据库。 - Christoph Walesch

1
我刚刚通过扩展到转移自反闭包表模型来修复了一个缓慢的删除操作,该模型在这里描述:http://www.dba-oracle.com/t_sql_patterns_incremental_eval.htm。维护其中的路径计数需要更多的工作,但当删除从每个单独的删除操作需要6秒时,它付出了巨大的回报(现在我可以删除图中的每个关系,然后在总共14秒内添加所有关系,共4,000个)。

而且作为奖励,最短路径长度可以类似于路径总数一样进行维护。http://www.tjhsst.edu/~rlatimer/acm/DatabaseSystems/ShortestDistanceinFirstOrderLogicSQLp698-pangTODS-Oct05.pdf - nclu

0

尝试为相关的where子句(例如:childparent)创建索引。

我不熟悉Firebird,但请查看它的“firebird describe”如何工作,并检查它是否使用了适当的索引以加速您在过程中进行的选择。

即使创建索引会导致更新/删除/插入性能下降,在您的情况下,它可以改善结果。


真正的实现有索引;我只是没有将“CREATE INDEX”语句复制到上面的代码中。 - Christoph Walesch

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接