用Python解析XML并转换为Pandas数据框。

3

我正在尝试读取 XML 文件并将其转换为 Pandas。然而,它返回空数据。

这是 XML 结构的示例:

<Instance ID="1">
<MetaInfo StudentID ="DTSU040" TaskID="LP03_PR09.bLK.sh"  DataSource="DeepTutorSummer2014"/>
<ProblemDescription>A car windshield collides with a mosquito, squashing it.</ProblemDescription>
<Question>How does this work tion?</Question>
<Answer>tthis is my best  </Answer>
<Annotation Label="correct(0)|correct_but_incomplete(1)|contradictory(0)|incorrect(0)">
<AdditionalAnnotation ContextRequired="0" ExtraInfoInAnswer="0"/>
<Comments Watch="1"> The student forgot to tell the opposite force. Opposite means opposite direction, which is important here. However, one can argue that the opposite is implied. See the reference answers.</Comments>
</Annotation>
<ReferenceAnswers>
1:  Since the windshield exerts a force on the mosquito, which we can call action, the mosquito exerts an equal and opposite force on the windshield, called the reaction.

</ReferenceAnswers>
</Instance>

我尝试了这段代码,但是它在我的电脑上不起作用。 它返回空的数据框。

import pandas as pd 
import xml.etree.ElementTree as et 

# Parse the XML file from disk and take its root element.
xtree = et.parse("grade_data.xml")
xroot = xtree.getroot() 

# Columns requested for the final DataFrame.
# NOTE(review): "StudentID" and "Label" are collected below but are not listed
# here, so pd.DataFrame(..., columns=df_cols) will not include them.
df_cols = ["ID", "TaskID", "DataSource", "ProblemDescription", 'Question', 'Answer',
           'ContextRequired', 'ExtraInfoInAnswer', 'Comments', 'Watch', 'ReferenceAnswers']
rows = []


# Iterates over the direct children of the root element.
# NOTE(review): whether the root is a wrapper around <Instance> elements or a
# single <Instance> cannot be told from this snippet -- confirm against the file.
for node in xroot: 
    # "ID" is read as an attribute of the current node.
    s_name = node.attrib.get("ID")
    # NOTE(review): Element.find() looks for a *child element* with the given
    # tag and returns None when there is none. In the sample XML, StudentID,
    # TaskID and DataSource are attributes of <MetaInfo>, not elements.
    s_student = node.find("StudentID") 
    s_task = node.find("TaskID") 
    s_source = node.find("DataSource") 
    # NOTE(review): these store the Element object (or None), not its .text.
    s_desc = node.find("ProblemDescription") 
    s_question = node.find("Question") 
    s_ans = node.find("Answer") 
    # NOTE(review): in the sample XML, Label is an attribute of <Annotation>,
    # ContextRequired / ExtraInfoInAnswer are attributes of
    # <AdditionalAnnotation>, and Watch is an attribute of <Comments>.
    s_label = node.find("Label") 
    s_contextrequired = node.find("ContextRequired") 
    s_extraInfoinAnswer = node.find("ExtraInfoInAnswer")
    # NOTE(review): in the sample XML, <Comments> is nested inside <Annotation>.
    s_comments = node.find("Comments") 
    s_watch = node.find("Watch") 
    s_referenceAnswers = node.find("ReferenceAnswers") 


    # One dict per iterated node.
    # NOTE(review): the key "s_contextrequired" does not match the
    # "ContextRequired" entry in df_cols.
    rows.append({"ID": s_name,"StudentID":s_student, "TaskID": s_task, 
                 "DataSource": s_source, "ProblemDescription": s_desc , 
                 "Question": s_question , "Answer": s_ans ,"Label": s_label,
                 "s_contextrequired": s_contextrequired , "ExtraInfoInAnswer": s_extraInfoinAnswer ,
                 "Comments": s_comments ,  "Watch": s_watch, "ReferenceAnswers": s_referenceAnswers, 

                })

# Build the frame, keeping only the columns named in df_cols.
out_df = pd.DataFrame(rows, columns = df_cols)

1
这个回答解决了你的问题吗?如何将XML文件转换为漂亮的Pandas数据框? - iacob
2个回答

2
你的解决方案的问题在于“元素数据提取”没有正确完成。你在问题中给出的XML文件嵌套了多层,因此需要递归地读取并提取数据。下面的解决方案应该可以满足你在这种情况下的需求。不过,我仍建议你查看这篇文章和Python文档,以获得更清晰的理解。
方法:1
import numpy as np
import pandas as pd
#import os
import xml.etree.ElementTree as ET

def xml2df(xml_source, df_cols, source_is_file=False, show_progress=True):
    """Flatten each child of an XML root element into one DataFrame row.

    Every direct child of the root (an "instance") becomes one row. The row
    is filled by walking all descendants of the instance: each descendant
    contributes its attributes (attribute name -> value) and its own text
    (tag name -> text). When the same key is produced more than once, the
    value written last wins. Only the keys listed in ``df_cols`` are kept as
    columns; requested keys that never occur stay ``None``.

    For reference, given an element such as::

        <element key1="value1" key2="value2">
            <child1>Child 1 Text</child1>
            <child2>Child 2 Text</child2>
        </element>

    ``element.attrib`` (or ``element.items()``) gives the attributes,
    iterating ``element`` (or ``list(element)``) gives the child elements,
    ``element.text`` gives the text and ``element.tag`` the tag name.

    Args:
        xml_source: Path of an XML file when ``source_is_file`` is True,
            otherwise a string holding the XML document.
        df_cols: Column labels of the returned frame, in order.
        source_is_file: Whether ``xml_source`` is a file path (True) or an
            XML string (False).
        show_progress: Print one line per processed instance.

    Returns:
        A DataFrame with columns ``df_cols`` and one row per instance,
        indexed by the value of the instance's first attribute ("ID" in the
        sample data). Instances sharing that value overwrite one another.
        An instance without any attribute is indexed by its zero-based
        position under the root. A root without children yields an empty
        frame that still carries the ``df_cols`` columns.
    """
    if source_is_file:
        xroot = ET.parse(xml_source).getroot()
    else:
        xroot = ET.fromstring(xml_source)

    def collect(element, record):
        """Merge attributes and text of every descendant into ``record``."""
        for child in element:
            # Descend first so that the child's own attributes and text are
            # written after (and therefore override) those of its subtree.
            collect(child, record)
            record.update(child.attrib)
            record[child.tag] = child.text

    consolidator_dict = {}
    for position, instance in enumerate(xroot):
        record = dict.fromkeys(df_cols)

        # The first attribute of the instance identifies the row. Going
        # through ``attrib`` avoids indexing ``items()``, which fails when
        # there are no attributes.
        first_attr = next(iter(instance.attrib.items()), None)
        if first_attr is None:
            row_key = position
            label = '#{}'.format(position)
        else:
            ikey, row_key = first_attr
            record[ikey] = row_key
            label = '{}={}'.format(ikey, row_key)

        if show_progress:
            print('{}: {}'.format(instance.tag, label))

        collect(instance, record)
        consolidator_dict[row_key] = record

    if not consolidator_dict:
        # Nothing to transpose; selecting df_cols would raise KeyError.
        return pd.DataFrame(columns=df_cols)

    # Outer keys become columns, so transpose to get one row per instance.
    return pd.DataFrame(consolidator_dict).T[df_cols]

运行以下命令以生成所需的输出。
# Path of the XML file to convert.
xml_source = r'grade_data.xml'
# Columns to keep in the resulting DataFrame, in display order.
df_cols = ["ID", "TaskID", "DataSource", "ProblemDescription", "Question", "Answer",
           "ContextRequired", "ExtraInfoInAnswer", "Comments", "Watch", 'ReferenceAnswers']

# source_is_file=True because xml_source is a file path, not an XML string.
df = xml2df(xml_source, df_cols, source_is_file = True)
# Bare expression: echoes the frame when run in an interactive session.
df

方法二:
给定一个 `xml_string`,你可以将其转换为 `xml >> dict >> dataframe`。运行以下代码以获得所需的输出。
注意:您需要安装 xmltodict 才能使用第二种方法。这个方法受到 @martin-blech 在 How to convert XML to JSON in Python? [duplicate]中提出的解决方案的启发。感谢 @martin-blech
pip install -U xmltodict

Solution

def read_recursively(x, instance_dict, columns=None):
    """Flatten one xmltodict-style nested dict into ``instance_dict``.

    In the dicts this function walks, attribute keys carry an ``@`` prefix
    and an element's own text is stored under ``#text``. The ``@`` is
    stripped from every key. A key listed in ``columns`` is stored with its
    value; when the value is a nested dict it is walked recursively and, if
    that nested element has text, the text replaces the stored dict.

    Args:
        x: Dict describing one element (attributes, children, ``#text``).
        instance_dict: Dict that receives the flattened key/value pairs; it
            is updated in place and also returned.
        columns: Keys of interest. Defaults to the module-level ``df_cols``
            so that existing two-argument calls keep working.

    Returns:
        A tuple ``(instance_dict, text)`` where ``text`` is the ``#text`` of
        ``x`` itself, or ``''`` when ``x`` has none.
    """
    if columns is None:
        columns = df_cols  # column list defined by the calling script

    own_text = ''
    for key, value in x.items():
        k = key.replace("@", "")

        # Text of the nested element under *this* key only. It is reset for
        # every key so it cannot leak onto later siblings or up to the
        # parent, which previously overwrote unrelated columns.
        child_text = ''
        if isinstance(value, dict):
            instance_dict, child_text = read_recursively(
                value, instance_dict, columns)

        if k in columns:
            instance_dict[k] = value

        if k == '#text':
            own_text = value
        elif child_text != '':
            # Prefer the element's text over its raw dict representation.
            instance_dict[k] = child_text
    return (instance_dict, own_text)

你需要使用上面提供的函数read_recursively()。现在运行以下代码。
import xmltodict, json

o = xmltodict.parse(xml_string) # INPUT: XML_STRING
#print(json.dumps(o)) # uncomment to see xml to json converted string

consolidated_dict = dict()
oi = o['Instances']['Instance']
# xmltodict yields a list only when <Instance> occurs more than once; a
# single <Instance> comes back as a plain dict. Normalise to a list so the
# loop below always receives one dict per instance.
if isinstance(oi, dict):
    oi = [oi]

for x in oi:
    # Flatten this instance and key the row by its ID attribute.
    instance_dict, _ = read_recursively(x, dict())
    consolidated_dict[x.get("@ID")] = instance_dict

# Outer keys become columns, so transpose to get one row per instance,
# then keep only the requested columns in the requested order.
df = pd.DataFrame(consolidated_dict).T
df = df[df_cols]
df

那很令人印象深刻。谢谢。 - Hani Ihlayyle
@mzjn 感谢您指出我可以在解决方案中改进的两个问题。
  1. 我不知道 element.getchildren() 已经被弃用了。我会在解决方案中使用 list(element) 进行更新。
  2. 关于解析:是的,您说得对,xml 解析器已经解析了 DOM 并创建了树形结构;这就是为什么我们可以获取根元素和其他元素。我的意思是“元素数据提取”没有做好。@HaniIhlayyle 您在问题中提到的 xml 嵌套在几层中。这就是为什么我们需要递归读取和提取数据的原因。
- CypherX

1

几个问题:

  • 在循环变量node上调用.find,期望存在一个子节点:current_node.find('child_of_current_node')。然而,由于所有节点都是根节点的子节点,它们不维护自己的子节点,因此不需要循环;
  • 未检查 find() 在节点缺失时可能返回的 None,这会导致无法获取 .tag、.text 或其他属性;
  • 未使用.text检索节点内容,否则将返回<Element...对象;

考虑使用三目运算表达式a if condition else b进行调整,以确保变量具有值:

rows = []

# xroot is treated as a single <Instance> element, as in the sample XML.
# Several requested values are attributes rather than child elements, and
# <AdditionalAnnotation> / <Comments> are nested inside <Annotation>, so
# locate the carrier elements once and read from them.
meta = xroot.find("MetaInfo")
annotation = xroot.find("Annotation")
extra = xroot.find("Annotation/AdditionalAnnotation")
comments = xroot.find("Annotation/Comments")

s_name = xroot.attrib.get("ID")

# Attributes of <MetaInfo>.
s_student = meta.get("StudentID") if meta is not None else None
s_task = meta.get("TaskID") if meta is not None else None
s_source = meta.get("DataSource") if meta is not None else None

# Text of direct children; findtext() returns None when the element is
# missing (and an empty string when it is present but has no text).
s_desc = xroot.findtext("ProblemDescription")
s_question = xroot.findtext("Question")
s_ans = xroot.findtext("Answer")
s_referenceAnswers = xroot.findtext("ReferenceAnswers")

# Attributes and text found inside <Annotation>.
s_label = annotation.get("Label") if annotation is not None else None
s_contextrequired = extra.get("ContextRequired") if extra is not None else None
s_extraInfoinAnswer = extra.get("ExtraInfoInAnswer") if extra is not None else None
s_comments = comments.text if comments is not None else None
s_watch = comments.get("Watch") if comments is not None else None

# Keys must match the labels in df_cols; keys that are not listed there
# (e.g. "StudentID", "Label") are dropped by the DataFrame constructor.
rows.append({"ID": s_name, "StudentID": s_student, "TaskID": s_task,
             "DataSource": s_source, "ProblemDescription": s_desc,
             "Question": s_question, "Answer": s_ans, "Label": s_label,
             "ContextRequired": s_contextrequired, "ExtraInfoInAnswer": s_extraInfoinAnswer,
             "Comments": s_comments, "Watch": s_watch, "ReferenceAnswers": s_referenceAnswers
            })

out_df = pd.DataFrame(rows, columns = df_cols)

或者,运行一个更动态的版本,使用迭代变量分配到内部字典中:

rows = []

# Build ONE dict for the whole instance. Creating a new dict per node
# would produce one almost-empty row per child element instead.
inner = {}
# iter() walks xroot itself and all of its descendants in document order,
# so nested elements such as <Comments> are reached as well.
for node in xroot.iter():
    # Attributes (ID, TaskID, Watch, ...) become columns too.
    inner.update(node.attrib)
    inner[node.tag] = node.text

rows.append(inner)

# Keys not listed in df_cols are dropped by the constructor.
out_df = pd.DataFrame(rows, columns = df_cols)

或者列表/字典推导式:

# A single dict (one row) holding every attribute and every element text
# found in xroot and its descendants. A list with one dict per node would
# instead yield one almost-empty row per child element.
rows = [{
    key: value
    for node in xroot.iter()
    for key, value in [*node.attrib.items(), (node.tag, node.text)]
}]
# Keys not listed in df_cols are dropped by the constructor.
out_df = pd.DataFrame(rows, columns = df_cols)

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接