如何使用Python解析复杂的文本文件?

32

我正在寻找一种将复杂文本文件解析成pandas DataFrame的简单方法。以下是一个示例文件,我想在解析后得到的结果,以及我的当前方法。

有没有办法使它更加简洁/更快/更pythonic/更易读?

我还在Code Review上发布了这个问题。

最终,我写了一篇博客文章来向初学者解释这个问题

这里是一个示例文件:

Sample text

A selection of students from Riverdale High and Hogwarts took part in a quiz. This is a record of their scores.

School = Riverdale High
Grade = 1
Student number, Name
0, Phoebe
1, Rachel

Student number, Score
0, 3
1, 7

Grade = 2
Student number, Name
0, Angela
1, Tristan
2, Aurora

Student number, Score
0, 6
1, 3
2, 9

School = Hogwarts
Grade = 1
Student number, Name
0, Ginny
1, Luna

Student number, Score
0, 8
1, 7

Grade = 2
Student number, Name
0, Harry
1, Hermione

Student number, Score
0, 5
1, 10

Grade = 3
Student number, Name
0, Fred
1, George

Student number, Score
0, 0
1, 0

这是我希望解析后的结果:

                                         Name  Score
School         Grade Student number                 
Hogwarts       1     0                  Ginny      8
                     1                   Luna      7
               2     0                  Harry      5
                     1               Hermione     10
               3     0                   Fred      0
                     1                 George      0
Riverdale High 1     0                 Phoebe      3
                     1                 Rachel      7
               2     0                 Angela      6
                     1                Tristan      3
                     2                 Aurora      9

这是我目前的解析方式:

import re
import pandas as pd


def parse(filepath):
    """
    Parse text at given filepath

    Parameters
    ----------
    filepath : str
        Filepath for file to be parsed

    Returns
    -------
    data : pd.DataFrame
        Parsed data

    """

    data = []
    with open(filepath, 'r') as file:
        line = file.readline()
        while line:
            reg_match = _RegExLib(line)

            if reg_match.school:
                school = reg_match.school.group(1)

            if reg_match.grade:
                grade = reg_match.grade.group(1)
                grade = int(grade)

            if reg_match.name_score:
                value_type = reg_match.name_score.group(1)
                line = file.readline()
                while line.strip():
                    number, value = line.strip().split(',')
                    value = value.strip()
                    dict_of_data = {
                        'School': school,
                        'Grade': grade,
                        'Student number': number,
                        value_type: value
                    }
                    data.append(dict_of_data)
                    line = file.readline()

            line = file.readline()

        data = pd.DataFrame(data)
        data.set_index(['School', 'Grade', 'Student number'], inplace=True)
        # consolidate df to remove nans
        data = data.groupby(level=data.index.names).first()
        # upgrade Score from float to integer
        data = data.apply(pd.to_numeric, errors='ignore')
    return data


class _RegExLib:
    """Set up regular expressions"""
    # use https://regexper.com to visualise these if required
    _reg_school = re.compile('School = (.*)\n')
    _reg_grade = re.compile('Grade = (.*)\n')
    _reg_name_score = re.compile('(Name|Score)')

    def __init__(self, line):
        # check whether line has a positive match with all of the regular expressions
        self.school = self._reg_school.match(line)
        self.grade = self._reg_grade.match(line)
        self.name_score = self._reg_name_score.search(line)


if __name__ == '__main__':
    filepath = 'sample.txt'
    data = parse(filepath)
    print(data)

3
在处理文本时,可以按照学习曲线逐步采用以下三种方法:使用 str 方法;使用 re/regex 模块;使用解析库(例如 parsimonious、PLY、pyparsing等)。请注意,这些方法的复杂度是逐步增加的。 - pylang
5个回答

39

2019年更新(PEG解析器):

这个答案得到了相当多的关注,所以我感觉需要添加另一种可能性,即解析选项。在这里,我们可以使用PEG解析器(例如parsimonious),与NodeVisitor类结合使用:

from parsimonious.grammar import Grammar
from parsimonious.nodes import NodeVisitor
import pandas as pd
grammar = Grammar(
    r"""
    schools         = (school_block / ws)+

    school_block    = school_header ws grade_block+ 
    grade_block     = grade_header ws name_header ws (number_name)+ ws score_header ws (number_score)+ ws? 

    school_header   = ~"^School = (.*)"m
    grade_header    = ~"^Grade = (\d+)"m
    name_header     = "Student number, Name"
    score_header    = "Student number, Score"

    number_name     = index comma name ws
    number_score    = index comma score ws

    comma           = ws? "," ws?

    index           = number+
    score           = number+

    number          = ~"\d+"
    name            = ~"[A-Z]\w+"
    ws              = ~"\s*"
    """
)

tree = grammar.parse(data)

class SchoolVisitor(NodeVisitor):
    output, names = ([], [])
    current_school, current_grade = None, None

    def _getName(self, idx):
        for index, name in self.names:
            if index == idx:
                return name

    def generic_visit(self, node, visited_children):
        return node.text or visited_children

    def visit_school_header(self, node, children):
        self.current_school = node.match.group(1)

    def visit_grade_header(self, node, children):
        self.current_grade = node.match.group(1)
        self.names = []

    def visit_number_name(self, node, children):
        index, name = None, None
        for child in node.children:
            if child.expr.name == 'name':
                name = child.text
            elif child.expr.name == 'index':
                index = child.text

        self.names.append((index, name))

    def visit_number_score(self, node, children):
        index, score = None, None
        for child in node.children:
            if child.expr.name == 'index':
                index = child.text
            elif child.expr.name == 'score':
                score = child.text

        name = self._getName(index)

        # build the entire entry
        entry = (self.current_school, self.current_grade, index, name, score)
        self.output.append(entry)

sv = SchoolVisitor()
sv.visit(tree)

df = pd.DataFrame.from_records(sv.output, columns = ['School', 'Grade', 'Student number', 'Name', 'Score'])
print(df)

正则表达式选项(译文)

那么,在第x次观看《指环王》时,我必须花些时间来到达最后:


具体来说,将问题分解成几个较小的问题:

  1. 分离每个学校
  2. ... 每个年级
  3. ... 学生和成绩
  4. ... 最后将它们绑定在一起形成数据框架


学校部分(请参见regex101.com上的演示

^
School\s*=\s*(?P<school_name>.+)
(?P<school_content>[\s\S]+?)
(?=^School|\Z)


成绩部分 (在regex101.com上的另一个演示)

^
Grade\s*=\s*(?P<grade>.+)
(?P<students>[\s\S]+?)
(?=^Grade|\Z)


学生/成绩部分(regex101.com上的最新演示):

^
Student\ number,\ Name[\n\r]
(?P<student_names>(?:^\d+.+[\n\r])+)
\s*
^
Student\ number,\ Score[\n\r]
(?P<student_scores>(?:^\d+.+[\n\r])+)

剩下的是一个生成器表达式,然后将其送入 DataFrame 构造函数(连同列名)。
import pandas as pd, re

rx_school = re.compile(r'''
    ^
    School\s*=\s*(?P<school_name>.+)
    (?P<school_content>[\s\S]+?)
    (?=^School|\Z)
''', re.MULTILINE | re.VERBOSE)

rx_grade = re.compile(r'''
    ^
    Grade\s*=\s*(?P<grade>.+)
    (?P<students>[\s\S]+?)
    (?=^Grade|\Z)
''', re.MULTILINE | re.VERBOSE)

rx_student_score = re.compile(r'''
    ^
    Student\ number,\ Name[\n\r]
    (?P<student_names>(?:^\d+.+[\n\r])+)
    \s*
    ^
    Student\ number,\ Score[\n\r]
    (?P<student_scores>(?:^\d+.+[\n\r])+)
''', re.MULTILINE | re.VERBOSE)


result = ((school.group('school_name'), grade.group('grade'), student_number, name, score)
    for school in rx_school.finditer(string)
    for grade in rx_grade.finditer(school.group('school_content'))
    for student_score in rx_student_score.finditer(grade.group('students'))
    for student in zip(student_score.group('student_names')[:-1].split("\n"), student_score.group('student_scores')[:-1].split("\n"))
    for student_number in [student[0].split(", ")[0]]
    for name in [student[0].split(", ")[1]]
    for score in [student[1].split(", ")[1]]
)

df = pd.DataFrame(result, columns = ['School', 'Grade', 'Student number', 'Name', 'Score'])
print(df)


Condensed:

rx_school = re.compile(r'^School\s*=\s*(?P<school_name>.+)(?P<school_content>[\s\S]+?)(?=^School|\Z)', re.MULTILINE)
rx_grade = re.compile(r'^Grade\s*=\s*(?P<grade>.+)(?P<students>[\s\S]+?)(?=^Grade|\Z)', re.MULTILINE)
rx_student_score = re.compile(r'^Student number, Name[\n\r](?P<student_names>(?:^\d+.+[\n\r])+)\s*^Student number, Score[\n\r](?P<student_scores>(?:^\d+.+[\n\r])+)', re.MULTILINE)

这将产生:
            School Grade Student number      Name Score
0   Riverdale High     1              0    Phoebe     3
1   Riverdale High     1              1    Rachel     7
2   Riverdale High     2              0    Angela     6
3   Riverdale High     2              1   Tristan     3
4   Riverdale High     2              2    Aurora     9
5         Hogwarts     1              0     Ginny     8
6         Hogwarts     1              1      Luna     7
7         Hogwarts     2              0     Harry     5
8         Hogwarts     2              1  Hermione    10
9         Hogwarts     3              0      Fred     0
10        Hogwarts     3              1    George     0


关于时间,这是运行它一万次的结果:

import timeit
print(timeit.timeit(makedf, number=10**4))
# 11.918397722000009 s


3
哇哦!这太棒了。希望有一天我也能像这样自己写出代码。然而,我问问题的原因是想找一个易于理解的方法来解析文本文件,以便我可以教给完全的初学者。我认为你的代码非常简明扼要,但可能不是初学者容易编写的东西。不过还是谢谢你分享!我会研究这个代码以加深我的理解。 :) - bluprince13
1
@bluprince13:不,那绝对不是教给初学者的东西 :) - Jan
2
@bluprince13:你真的可以考虑一下 https://codereview.stackexchange.com/。 - Jan
2
@bluprince13:与你的相比,这段代码更简单、易于扩展和维护。它并不是为了复杂而复杂。虽然我同意这不是一个好的起点,但我想Jan一开始也是从简单的模型开始的,而你的已经(必然)很复杂了。作为可能性的一个例子,这很出色,但我相信CodeReview的伙伴们可以帮助你进一步完善。 - Jongware
2
好的答案。 :) - Wiktor Stribiżew

8

这里是我的建议,使用split和pd.concat(“txt”代表问题中原始文本的副本),基本上的想法是按组词分割,然后将它们合并成数据框,最内部的解析利用了名称和成绩类似于CSV格式的事实。具体如下:

import pandas as pd
from io import StringIO

schools = txt.lower().split('school = ')
schools_dfs = []
for school in schools[1:]:
    grades = school.split('grade = ') 
    grades_dfs = []
    for grade in grades[1:]:
        features = grade.split('student number,')
        feature_dfs = []
        for feature in features[1:]:
            feature_dfs.append(pd.read_csv(StringIO(feature)))
        feature_df = pd.concat(feature_dfs, axis=1)
        feature_df['grade'] = features[0].replace('\n','')
        grades_dfs.append(feature_df)
    grades_df = pd.concat(grades_dfs)
    grades_df['school'] = grades[0].replace('\n','')
    schools_dfs.append(grades_df)
schools_df = pd.concat(schools_dfs)

schools_df.set_index(['school', 'grade'])

enter image description here


哇,非常不寻常,但是对于成功使用pandas给你加一分。 - pylang

7
我建议使用解析组合库,比如parsy。与使用正则表达式相比,结果可能不会那么简洁,但它将更加易读且健壮,并且仍然相对轻量级。
总的来说,解析是一项相当困难的任务,适用于初学者的方法可能很难找到。
2022年编辑: 完整示例代码,使用现代Parsy解析您提供的示例并生成相同的输出。
它分为3个阶段:
- 将其解析成一些基本数据结构 - 稍微转换以匹配学生姓名/编号/分数 - 转换为DataFrame 这种分离意味着在DataFrame级别需要较少的hack。
import pandas as pd
from parsy import string, regex, seq
from dataclasses import dataclass


@dataclass
class Student:
    name: str
    number: int


@dataclass
class Score:
    score: int
    number: int


@dataclass
class StudentWithScore:
    name: str
    number: int
    score: int


@dataclass
class Grade:
    grade: int
    students: list[Student]
    scores: list[Score]

    @property
    def students_with_scores(self) -> list[StudentWithScore]:
        names = {st.number: st.name for st in self.students}
        return [StudentWithScore(names[score.number], score.number, score.score) for score in self.scores]


@dataclass
class School:
    name: str
    grades: list[Grade]


integer = regex(r"\d+").map(int)
student_number = integer
score = integer
student_name = regex(r"[^\n]+")
student_def = seq(
    number=student_number << string(", "),
    name=student_name << string("\n"),
).combine_dict(Student)

student_def_list = string("Student number, Name\n") >> student_def.many()
score_def = seq(
    number=student_number << string(", "),
    score=score << string("\n"),
).combine_dict(Score)
score_def_list = string("Student number, Score\n") >> score_def.many()
grade_value = integer
grade_def = string("Grade = ") >> grade_value << string("\n")
school_grade = seq(
    grade=grade_def,
    students=student_def_list << regex(r"\n*"),
    scores=score_def_list << regex(r"\n*"),
).combine_dict(Grade)

school_name = regex(r"[^\n]+")
school_def = string("School = ") >> school_name << string("\n")
school = seq(
    name=school_def,
    grades=school_grade.many(),
).combine_dict(School)


def parse(text: str) -> list[School]:
    return school.many().parse(text)


def schools_to_dataframe(schools: list[School]) -> pd.DataFrame:
    data_dicts = [
        {"School": school.name, "Grade": g.grade, "Student number": s.number, "Name": s.name, "Score": s.score}
        for school in schools
        for g in school.grades
        for s in g.students_with_scores
    ]
    data = pd.DataFrame(data_dicts)
    data.set_index(["School", "Grade", "Student number"], inplace=True)
    return data


if __name__ == "__main__":
    filepath = "sample.txt"
    text = open(filepath).read()
    start = text.index("School =")
    schools = parse(text[start:])
    data = schools_to_dataframe(schools)
    print(data)

没听说过这个。可以加到 https://github.com/vinta/awesome-python 吗? - Bill Bell
@BillBell 感谢您的建议 - PR - https://github.com/vinta/awesome-python/pull/993 - spookylukey

3
与原始代码类似,我定义了解析正则表达式的方式。
import re
import pandas as pd

parse_re = {
    'school': re.compile(r'School = (?P<school>.*)$'),
    'grade': re.compile(r'Grade = (?P<grade>\d+)'),
    'student': re.compile(r'Student number, (?P<info>\w+)'),
    'data': re.compile(r'(?P<number>\d+), (?P<value>.*)$'),
}

def parse(line):
    '''parse the line by regex search against possible line formats
       returning the id and match result of first matching regex,
       or None if no match is found'''
    return reduce(lambda (i,m),(id,rx): (i,m) if m else (id, rx.search(line)), 
                  parse_re.items(), (None,None))

然后遍历每行信息,收集每个学生的信息。当记录完整时(我们有分数时),将记录添加到列表中。

一个小状态机由逐行正则表达式匹配驱动,汇总每个记录。特别的,我们必须按数字保存每个年级的学生,因为他们的分数和姓名是分开提供在输入文件中的。

results = []
with open('sample.txt') as f:
    record = {}
    for line in f:
        id, match = parse(line)

        if match is None:
            continue

        if id == 'school':
            record['School'] = match.group('school')
        elif id == 'grade':
            record['Grade'] = int(match.group('grade'))
            names = {}  # names is a number indexed dictionary of student names
        elif id == 'student':
            info = match.group('info')
        elif id == 'data':
            number = int(match.group('number'))
            value = match.group('value')
            if info == 'Name':
                names[number] = value
            elif info == 'Score':
                record['Student number'] = number
                record['Name'] = names[number]
                record['Score'] = int(value)
                results.append(record.copy())

最后,记录列表被转换为一个DataFrame
df = pd.DataFrame(results, columns=['School', 'Grade', 'Student number', 'Name', 'Score'])
print df

输出:

            School  Grade  Student number      Name  Score
0   Riverdale High      1               0    Phoebe      3
1   Riverdale High      1               1    Rachel      7
2   Riverdale High      2               0    Angela      6
3   Riverdale High      2               1   Tristan      3
4   Riverdale High      2               2    Aurora      9
5         Hogwarts      1               0     Ginny      8
6         Hogwarts      1               1      Luna      7
7         Hogwarts      2               0     Harry      5
8         Hogwarts      2               1  Hermione     10
9         Hogwarts      3               0      Fred      0
10        Hogwarts      3               1    George      0

一些优化的方法是首先比较最常见的正则表达式,明确跳过空行。边遍历数据时构建数据框可以避免多余的数据复制,但是我了解到向数据框追加数据是一项昂贵的操作。


这真的很好,谢谢。我喜欢你使用函数而不是类来进行正则表达式匹配。 - bluprince13
@blueprince13 在实践中,这些函数可能最终会成为类中的方法。任何具有状态的内容都应该被放在对象中,以便同时拥有多个实例。 - Mike Robins

0

这正是Pawpaw设计的问题类型。

Pawpaw是一个高性能的解析和文本分割框架,可以让您快速轻松地构建复杂的管道解析器。段落会自动组织成树状图,可以使用强大的结构化查询语言进行序列化、遍历和搜索。

这里有两种不同的基于Pawpaw的方法来解决这个问题。我复制了最紧凑的版本,并添加了一些树形可视化:

代码

import sys
import os.path

import regex
from pawpaw import arborform, Ito, visualization
import pandas as pd

def get_parser() -> arborform.Itorator:
    return arborform.Extract(
        regex.compile(
            r'(?<school>School = (?<name>.+?)\n'
            r'(?<grade>Grade = (?<key>\d+)\n'
            r'Student number, Name\n(?P<stu_num_names>(?:(?P<stu_num>\d+), (?P<name>.+?)\n)+)\n'
            r'Student number, Score\n(?P<stu_num_scores>(?:(?P<stu_num>\d+), (?P<score>\d+)(?:$|\n))+)(?:$|\n)'
            r')+)+',
            regex.DOTALL
    )
)

# read file
with open(os.path.join(sys.path[0], 'input.txt')) as f:
    ito = Ito(f.read(), desc='all')

# parse
parser = get_parser()
ito.children.add(*parser(ito))

# display Pawpaw tree
tree_vis = visualization.pepo.Tree()
print(tree_vis.dumps(ito))

# build pandas DataFrame
d = []
for school in ito.find_all('*[d:school]'):
    school_name = str(school.find('*[d:name]'))
    for grade in school.find_all('**[d:grade]'):
        grade_key = int(str(grade.find('*[d:key]')))
        for stu_num in grade.find_all('*[d:stu_num_names]/*[d:stu_num]'):
            stu_name = str(stu_num.find('>[d:name]'))
            stu_num = str(stu_num)
            stu_score = int(str(grade.find('*[d:stu_num_scores]/*[d:stu_num]&[s:' + stu_num + ']/>[d:score]')))
            d.append({'School': school_name, 'Grade': grade_key, 'Student number': stu_num, 'Name': stu_name, 'Score': stu_score})
data = pd.DataFrame(d)
data.set_index(['School', 'Grade', 'Student number'], inplace=True)
data = data.groupby(level=data.index.names).first()

# display pandas DataFrame
print(data)

输出

(0, 478) 'all' : 'School = Riverdale H…, Score\n0, 0\n1, 0'
├──(0, 210) 'school' : 'School = Riverdale H…, 6\n1, 3\n2, 9\n\n'
│  ├──(9, 23) 'name' : 'Riverdale High'
│  ├──(24, 109) 'grade' : 'Grade = 1\nStudent n…ore\n0, 3\n1, 7\n\n'
│  │  ├──(32, 33) 'key' : '1'
│  │  ├──(55, 75) 'stu_num_names' : '0, Phoebe\n1, Rachel\n'
│  │  │  ├──(55, 56) 'stu_num' : '0'
│  │  │  ├──(58, 64) 'name' : 'Phoebe'
│  │  │  ├──(65, 66) 'stu_num' : '1'
│  │  │  └──(68, 74) 'name' : 'Rachel'
│  │  └──(98, 108) 'stu_num_scores' : '0, 3\n1, 7\n'
│  │     ├──(98, 99) 'stu_num' : '0'
│  │     ├──(101, 102) 'score' : '3'
│  │     ├──(103, 104) 'stu_num' : '1'
│  │     └──(106, 107) 'score' : '7'
│  └──(109, 210) 'grade' : 'Grade = 2\nStudent n…, 6\n1, 3\n2, 9\n\n'
│     ├──(117, 118) 'key' : '2'
│     ├──(140, 171) 'stu_num_names' : '0, Angela\n1, Tristan\n2, Aurora\n'
│     │  ├──(140, 141) 'stu_num' : '0'
│     │  ├──(143, 149) 'name' : 'Angela'
│     │  ├──(150, 151) 'stu_num' : '1'
│     │  ├──(153, 160) 'name' : 'Tristan'
│     │  ├──(161, 162) 'stu_num' : '2'
│     │  └──(164, 170) 'name' : 'Aurora'
│     └──(194, 209) 'stu_num_scores' : '0, 6\n1, 3\n2, 9\n'
│        ├──(194, 195) 'stu_num' : '0'
│        ├──(197, 198) 'score' : '6'
│        ├──(199, 200) 'stu_num' : '1'
│        ├──(202, 203) 'score' : '3'
│        ├──(204, 205) 'stu_num' : '2'
│        └──(207, 208) 'score' : '9'
└──(210, 478) 'school' : 'School = Hogwarts\nG…, Score\n0, 0\n1, 0'
   ├──(219, 227) 'name' : 'Hogwarts'
   ├──(228, 310) 'grade' : 'Grade = 1\nStudent n…ore\n0, 8\n1, 7\n\n'
   │  ├──(236, 237) 'key' : '1'
   │  ├──(259, 276) 'stu_num_names' : '0, Ginny\n1, Luna\n'
   │  │  ├──(259, 260) 'stu_num' : '0'
   │  │  ├──(262, 267) 'name' : 'Ginny'
   │  │  ├──(268, 269) 'stu_num' : '1'
   │  │  └──(271, 275) 'name' : 'Luna'
   │  └──(299, 309) 'stu_num_scores' : '0, 8\n1, 7\n'
   │     ├──(299, 300) 'stu_num' : '0'
   │     ├──(302, 303) 'score' : '8'
   │     ├──(304, 305) 'stu_num' : '1'
   │     └──(307, 308) 'score' : '7'
   ├──(310, 397) 'grade' : 'Grade = 2\nStudent n…re\n0, 5\n1, 10\n\n'
   │  ├──(318, 319) 'key' : '2'
   │  ├──(341, 362) 'stu_num_names' : '0, Harry\n1, Hermione\n'
   │  │  ├──(341, 342) 'stu_num' : '0'
   │  │  ├──(344, 349) 'name' : 'Harry'
   │  │  ├──(350, 351) 'stu_num' : '1'
   │  │  └──(353, 361) 'name' : 'Hermione'
   │  └──(385, 396) 'stu_num_scores' : '0, 5\n1, 10\n'
   │     ├──(385, 386) 'stu_num' : '0'
   │     ├──(388, 389) 'score' : '5'
   │     ├──(390, 391) 'stu_num' : '1'
   │     └──(393, 395) 'score' : '10'
   └──(397, 478) 'grade' : 'Grade = 3\nStudent n…, Score\n0, 0\n1, 0'
      ├──(405, 406) 'key' : '3'
      ├──(428, 446) 'stu_num_names' : '0, Fred\n1, George\n'
      │  ├──(428, 429) 'stu_num' : '0'
      │  ├──(431, 435) 'name' : 'Fred'
      │  ├──(436, 437) 'stu_num' : '1'
      │  └──(439, 445) 'name' : 'George'
      └──(469, 478) 'stu_num_scores' : '0, 0\n1, 0'
         ├──(469, 470) 'stu_num' : '0'
         ├──(472, 473) 'score' : '0'
         ├──(474, 475) 'stu_num' : '1'
         └──(477, 478) 'score' : '0'

                                         Name  Score
School         Grade Student number                 
Hogwarts       1     0                  Ginny      8
                     1                   Luna      7
               2     0                  Harry      5
                     1               Hermione     10
               3     0                   Fred      0
                     1                 George      0
Riverdale High 1     0                 Phoebe      3
                     1                 Rachel      7
               2     0                 Angela      6
                     1                Tristan      3
                     2                 Aurora      9

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接