How to handle complex lists and dictionaries in Python


I have been writing a program in Python and I need to deal with a complex list/dict problem. The task is to turn this data into a dict and sort it; the input of this function comes from a tree structure. The code I'm sharing works, but it takes a very long time to run. My question is: is there any way to make this function run faster in Python? In case it matters, I'm using Python 3.7.3. The reason I want to improve this code is that creating the input data for this function takes about 3-4 hours, while running this function takes about 21-22 hours (which really shocked me).

Here is the structure of my input data:

[ # list of trees
    [ # list of leaf nodes
        { # dict of spg that contain in leaf
            (tau_1,tau_2):[(ent1,ent2),(ent1,ent3)]
        }
    ]
]

Here is a sample of the data I'm processing:
[
    [
        {(7, 8): [(1, 28), (1, 29)]},
        {(7, 8): [(1, 30)]},
        {(7, 8): [(1, 32)]},
        {(7, 8): [(1, 21), (1, 22)]},
        {(7, 8): [(1, 18)]},
        {(7, 8): [(1, 19), (1, 31)]},
        {(7, 8): [(1, 13), (1, 14)]},
        {(7, 8): [(1, 16)]},
        {(7, 8): [(1, 15)]},
        {(7, 8): [(1, 11), (1, 12)]},
        {(7, 8): [(1, 17)]},
        {(7, 8): [(1, 23), (1, 26)]},
        {(7, 8): [(1, 20), (1, 27)]},
        {(7, 8): [(1, 7)]},
        {(7, 8): [(1, 9), (1, 10)]},
        {(7, 8): [(1, 8)]},
        {(7, 8): [(1, 5), (1, 6)]},
        {(7, 8): [(1, 24), (1, 25)]},
        {
            (0, 1): [(1, 2)],
            (1, 2): [(1, 2)],
            (2, 3): [(1, 2)],
            (3, 4): [(1, 2)],
            (4, 5): [(1, 2)],
            (5, 6): [(1, 2)],
            (6, 7): [(1, 2)],
            (7, 8): [(1, 2)],
        },
        {
            (0, 1): [(1, 3), (1, 4)],
            (1, 2): [(1, 3), (1, 4)],
            (2, 3): [(1, 3), (1, 4)],
            (3, 4): [(1, 3), (1, 4)],
            (4, 5): [(1, 3), (1, 4)],
            (5, 6): [(1, 3), (1, 4)],
            (6, 7): [(1, 3), (1, 4)],
            (7, 8): [(1, 3), (1, 4)],
        },
    ],
    [
        {(7, 8): [(2, 28), (2, 29)]},
        {(7, 8): [(2, 30)]},
        {(7, 8): [(2, 32)]},
        {(7, 8): [(2, 21), (2, 22)]},
        {(7, 8): [(2, 18)]},
        {(7, 8): [(2, 19), (2, 31)]},
        {(7, 8): [(2, 13), (2, 14)]},
        {(7, 8): [(2, 16)]},
        {(7, 8): [(2, 24), (2, 25)]},
        {(7, 8): [(2, 23)]},
        {(7, 8): [(2, 26), (2, 27)]},
        {(7, 8): [(2, 20)]},
        {(7, 8): [(2, 15)]},
        {(7, 8): [(2, 11), (2, 12)]},
        {(7, 8): [(2, 17)]},
        {(7, 8): [(2, 9), (2, 10)]},
        {(7, 8): [(2, 1)]},
        {(7, 8): [(2, 3), (2, 4)]},
        {(7, 8): [(2, 7), (2, 8)]},
        {(7, 8): [(2, 5), (2, 6)]},
    ]
]
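
To make the nesting concrete, here is a minimal, runnable illustration of how one leaf is reached (the values are just the first entries of the sample above; this is only for clarity, not part of my real code):

# a tiny slice of the structure above
data = [            # list of trees
    [               # list of leaf nodes of one tree
        {(7, 8): [(1, 28), (1, 29)]},   # leaf: {(tau_1, tau_2): [(ent, ent), ...]}
    ],
]

first_leaf = data[0][0]      # first leaf of the first tree
print(first_leaf[(7, 8)])    # [(1, 28), (1, 29)]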

These are the parameter values, in case you need them:
tau_lenght: 50
ent: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]
gamma: 1
tau_start: [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99]
tau_end: [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100]
time_range: 2
json_output: 'output_json'

Here is the code:

import json
def prepare_output(data,tau_lenght,ent,gamma,tau_start,tau_end,time_range,json_output):
    # step 1:
    # change data to dict where pair of tau became key
    # list of group of entity into sorted based on o_r
    spgs = {} # collect all spg per 2 tau based on entity
    for tau in range(tau_lenght-1): # for every tau
        temp_tau = [] # containing result per 2 tau
        for o_r in ent: # for every o_r
            temp = [] # contains all SPGs which contain o_r at tau
            for lst in data: # lst is the result on every decision tree
                for leaf in lst: # leaf from decision tree
                    if (tau,tau+1) in leaf.keys(): # looking for pair tau existence
                        # if leaf contain entity o_r
                        if list_contain(leaf[(tau,tau+1)],o_r):
                            # adding new crew candidate
                            # helper.list_of_ent is working for change [(1,2),(2,3)] into [1,2,3]
                            temp.append(helper.list_of_ent(leaf[(tau,tau+1)]))
            if temp: # if temp not empty
                temp_tau.append(temp) # adding temp into temp_tau
        if temp_tau: # if temp_tau not empty
            # connect all candidate into one spg
            cur_tau = []
            for lst in temp_tau:
                for pairs in lst:
                    cur_tau.append(pairs)
            spgs[(tau,tau+1)] = set(tuple(i) for i in cur_tau) # adding temp_tau into res
    
    # step 2:
    # change into dict which include group of ent as key,
    # and list of (tau1,tau2) as value
    result_per_group = {}
    for tau in spgs.keys():
        for group in spgs[tau]:
            if group in result_per_group.keys():
                result_per_group[group].append(tau)
            else:
                result_per_group[group] = [tau]
    
    # step 3:
    # remove all candidate > gamma
    for group in result_per_group.keys():
        lst = result_per_group[group]
        for tau_idx in range(len(lst)-1):
            tau1_end = tau_end[tau_idx]
            tau2_start = tau_start[tau_idx+1]
            if (tau2_start - tau1_end) > (time_range * gamma):
                result_per_group.pop(group)
                break
    
    global output
    # helper.dict_str_key is a function I made to convert dict keys into strings
    output['output'] = helper.dict_str_key(result_per_group)
    # to convert into json file
    helper.to_json(json_output,output)

# these are some functions from helper that I use here
def list_of_ent(lst):
    res = set()
    for e1,e2 in lst:
        res.add(e1)
        res.add(e2)
    return sorted(list(res))

def dict_str_key(data):
    data_str_key = {}
    for key in data.keys():
        data_str_key[str(key)] = data[key]
    return data_str_key

def to_json(name,data):
    name = 'output/'+name+'.json'
    with open(name, 'w') as file:
        json.dump(data, file)

To make the question clearer, let me walk through the three big loop steps in the code.

Step 1 produces output that looks like this:

{
    (tau1, tau2): {
        (ent1,ent2),
        (ent8,ent19)
    }
}

Step 2 turns that into the following shape:

{
    (ent1,ent2,ent3):[(tau1,tau2),(tau2,tau3)]
}

Step 3 outputs the same format as step 2.

P.S.: tau is a group of time frames (in this example I use 2 frames as 1 tau).
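
To show what steps 1 and 2 mean on a tiny input, here is a rough sketch using plain dicts (toy values, not my actual code):

# step 1 result (sketch): for each (tau1, tau2), the set of entity groups seen there
spgs = {
    (0, 1): {(1, 2), (3, 4)},
    (1, 2): {(1, 2)},
}

# step 2 (sketch): invert it so every entity group maps to the taus where it appears
result_per_group = {}
for tau_pair, groups in spgs.items():
    for group in groups:
        result_per_group.setdefault(group, []).append(tau_pair)

print(result_per_group)
# e.g. {(1, 2): [(0, 1), (1, 2)], (3, 4): [(0, 1)]} (set order may vary)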

That's my question; sorry if it got long or a bit complicated, and I hope it is clear. Any response and support is very much appreciated, thank you.


This program is too long for me to go through. However, try running it under the Python profiler and see which function takes the most time; use a reduced input so it executes faster. For example https://docs.python.org/3/library/profile.html - Albin Paul
Besides the comment about profiling the code, which is absolutely right, as a general guideline I'd suggest using list comprehensions instead of for loops for better performance. For example: temp = [] # contains all SPGs which contain o_r at tau ... temp.append(helper.list_of_ent(leaf[(tau,tau+1)])) can be simplified with a list comprehension: temp = [helper.list_of_ent(leaf[(tau, tau + 1)]) for (tau, tau + 1) in leaf.keys() for leaf in lst if <add_your_conditions>] - Arnab De
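
For reference, a self-contained sketch of what that comment is suggesting, with toy data and placeholder helpers (the question does not show list_contain, so the version below is an assumption about its behaviour):

def list_of_ent(pairs):
    # flatten [(1, 2), (2, 3)] into [1, 2, 3] (same as the question's helper)
    return sorted({e for pair in pairs for e in pair})

def list_contain(pairs, o_r):
    # assumed behaviour: True if entity o_r occurs in any pair
    return any(o_r in pair for pair in pairs)

lst = [{(7, 8): [(1, 28), (1, 29)]}, {(7, 8): [(1, 30)]}]
tau, o_r = 7, 28

# the inner append-loop replaced by a single list comprehension
temp = [
    list_of_ent(leaf[(tau, tau + 1)])
    for leaf in lst
    if (tau, tau + 1) in leaf and list_contain(leaf[(tau, tau + 1)], o_r)
]
print(temp)  # [[1, 28, 29]]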
1 Answer


Without the complete code to test the output this is harder to judge, but there seems to be some redundancy in your process: you append elements to a list of lists, then flatten it and add it to the dict as a set. By removing that and adding to the dict directly you can gain some speed and memory.

There are some other tweaks you can make, such as using f-strings instead of string concatenation, using list comprehensions, and not repeating the same calculation (time_range * gamma) inside the loop, computing it once and reusing the stored value instead.

But these are minor tweaks compared with your step 1 loop (roughly N^4 in time complexity). I'm not sure whether it's even worse than that, since I can't see the functions you use inside that loop, but tweaking it to reduce the number of computations will give you the biggest time savings.

import json
from collections import defaultdict

def prepare_output(data,tau_lenght,ent,gamma,tau_start,tau_end,time_range,json_output):
    # step 1:
    # change data to dict where pair of tau became key
    # list of group of entity into sorted based on o_r
    spgs = defaultdict(set) # collect all spg per 2 tau based on entity
    for tau in range(tau_lenght-1): # for every tau
        for o_r in ent: # for every o_r
            for lst in data: # lst is the result on every decision tree
                for leaf in lst: # leaf from decision tree
                    if (tau,tau+1) in leaf: # looking for pair tau existence
                        # if leaf contain entity o_r
                        if list_contain(leaf[(tau,tau+1)],o_r):
                            # adding new crew candidate
                            # helper.list_of_ent is working for change [(1,2),(2,3)] into [1,2,3]
                            list_unique_elements = helper.list_of_ent(leaf[(tau,tau+1)])
                            # spgs is a defaultdict(set), so the group can be added directly
                            spgs[(tau,tau+1)].add(tuple(list_unique_elements))
    
    # step 2:
    # change into dict which include group of ent as key,
    # and list of (tau1,tau2) as value
    result_per_group = {}
    for tau, value in spgs.items():
        for group in value:
            if group in result_per_group:
                result_per_group[group].append(tau)
            else: 
                result_per_group[group] = [tau]
    
    # step 3:
    # remove all candidate > gamma
    time_gamma_multiplied = time_range * gamma
    # iterate over a copy so entries can be popped safely while looping
    for group, lst in list(result_per_group.items()):
        for tau_idx in range(len(lst) - 1):
            if (tau_start[tau_idx+1] - tau_end[tau_idx]) > time_gamma_multiplied:
                result_per_group.pop(group)
                break
    
    global output
    # helper.dict_str_key is fuction I made to convert key of dict into string
    output['output'] = helper.dict_str_key(result_per_group)
    # to convert into json file
    helper.to_json(json_output,output)

# this some function from helper that I use in here
def list_of_ent(lst):
    res = set()
    for e1,e2 in lst:
        res.add(e1)
        res.add(e2)
    return sorted(list(res))

def dict_str_key(data):
    return {str(key): value for key, value in data.items()}

def to_json(name,data):
    with open(f'output/{name}.json', 'w') as file:
        json.dump(data, file)
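
Along those lines, the biggest win in step 1 is probably to walk the leaves once instead of rescanning all of them for every (tau, o_r) combination. A hedged sketch of that idea (build_spgs is a name I made up here; it assumes, as the original loop seems to, that a group only counts if it shares at least one entity with ent, and that only keys (tau, tau + 1) with tau < tau_lenght - 1 matter):

from collections import defaultdict

def build_spgs(data, tau_lenght, ent):
    # Single pass over all leaves: group entity tuples by their (tau, tau+1) key.
    # Sketch only; meant to produce the same spgs dict as the question's step 1
    # without the per-entity rescans.
    ent_set = set(ent)
    valid_taus = {(tau, tau + 1) for tau in range(tau_lenght - 1)}
    spgs = defaultdict(set)
    for tree in data:                       # each decision tree
        for leaf in tree:                   # each leaf dict
            for tau_pair, pairs in leaf.items():
                if tau_pair not in valid_taus:
                    continue
                # flatten [(1, 28), (1, 29)] -> (1, 28, 29)
                group = tuple(sorted({e for pair in pairs for e in pair}))
                # keep the group only if it shares an entity with ent,
                # mirroring the list_contain(...) filter in the question
                if ent_set.intersection(group):
                    spgs[tau_pair].add(group)
    return spgs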

Obviously, profiling the code will give you the best indication of which redundant processes take up a lot of time, or which ones can be reduced with small tweaks.
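
As a concrete starting point for that, here is a minimal profiling sketch using the standard-library cProfile module (prepare_output and its arguments are the ones from the question, called on a reduced input):

import cProfile
import pstats

# profile one call and dump the stats to a file
cProfile.run(
    "prepare_output(data, tau_lenght, ent, gamma, tau_start, tau_end, time_range, json_output)",
    "prepare_output.prof",
)

# show the 20 entries with the largest cumulative time
stats = pstats.Stats("prepare_output.prof")
stats.sort_stats("cumulative").print_stats(20)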
