使用awk(或可能是Python)根据多个字段比较两个文件

4

我想比较两个文件,并显示两个文件中的差异和缺失记录。

根据这个论坛上的建议,我发现awk是最快的解决方案。

基于组合键 - match_key 和 issuer_grid_id 进行比较。

代码:

BEGIN { FS="[= ]" }
{
    match(" "$0,/ match_key="[^"]+"/)
    key = substr($0,RSTART,RLENGTH)
}
NR==FNR {
    file1[key] = $0
    next
}
{
    if ( key in file1 ) {
        nf = split(file1[key],tmp)
        for (i=1; i<nf; i+=2) {
            f1[key,tmp[i]] = tmp[i+1]
        }

        msg = sep = ""
        for (i=1; i<NF; i+=2) {
            if ( $(i+1) != f1[key,$i] ) {
                msg = msg sep OFS ARGV[1] "." $i "=" f1[key,$i] OFS FILENAME "." $i "=" $(i+1)
                sep = ","
            }
        }
        if ( msg != "" ) {
            print "Mismatch in row " FNR msg
        }
        delete file1[key]
    }
    else {
        file2[key] = $0
    }
}
END {
    for (key in file1) {
        print "In file1 only:", key, file1[key]
    }
    for (key in file2) {
        print "In file2 only:", key, file2[key]
    }
}

文件1:

period="2021-02-28" book_base_ent_cd="U0028" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="USD" issuer_grid_id="2" match_key="PLCHS252SA20"
period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="3" match_key="PLCHS252SA20"
period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="2" match_key="PLCHS252SA22"
period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="2" match_key="PLCHS252SA21"

文件2:

period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="3" match_key="PLCHS252SA20"
period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="2" match_key="PLCHS252SA20"
period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="2" match_key="PLCHS252SA23"
period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="2" match_key="PLCHS252SA21"

文件3(它只有一行,但字段数量更多)
period="2021-02-28" book_base_ent_cd="U0027" other_inst_ident="PLCHS258Q463" rep_nom_curr="PLN" reporting_basis="Unit" src_instr_class="Debt" mat_date="2026-08-25" nom_curr="PLN" primary_asset_class="Bond" seniority_type="931" security_status="alive" issuer_name="CUST38677608" intra_group_prud_scope="Issuer is not part of the reporting group" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_frbrnc_stts="NOFRBRNRNGT" prfrmng_stts="Performing" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" src_imprmnt_assssmnt_mthd="COLLECTIVE" accmltd_imprmnt="78.54" accmltd_chngs_fv_cr="0" expsr_vl="0" unit_measure="EUR" unit_measure_nv="EUR" crryng_amnt="24565.13" issuer_grid_id="38677608" match_key="PLCHS258Q463"

期望输出:

In file1 only : issuer_grid_id="2" match_key="PLCHS252SA22"
In file2 only : issuer_grid_id="2" match_key="PLCHS252SA23"

Mismatch for issuer_grid_id="2" match_key="PLCHS252SA20" : file1.book_base_ent_cd="U0028" file2.book_base_ent_cd="U0027", file1.unit_measure="USD" file2.unit_measure="EUR" 

实际输出

awk -f compare.awk file1 file2
Mismatch in row 1 for    file1.issuer_grid_id="2" file2.issuer_grid_id="3", file1.match_key="PLCHS252SA21" file2.match_key="PLCHS252SA20"
In file2 only:  period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="2" match_key="PLCHS252SA21"

我找不到进行多字段比较的方法?

如有任何建议,将不胜感激。我还标记了Python,如果有更快的方法,请告知。

最好的问候。

2个回答

2

只需在顶部调整key的设置,即可使用任何一组字段,并将不匹配消息的打印方式更改为from key ... key而不是from line ... FNR:

$ cat tst.awk
BEGIN { FS="[= ]" }
{
    match(" "$0,/ issuer_grid_id="[^"]+"/)
    key = substr($0,RSTART,RLENGTH)
    match(" "$0,/ match_key="[^"]+"/)
    key = key substr($0,RSTART,RLENGTH)
}
NR==FNR {
    file1[key] = $0
    next
}
{
    if ( key in file1 ) {
        nf = split(file1[key],tmp)
        for (i=1; i<nf; i+=2) {
            f1[key,tmp[i]] = tmp[i+1]
        }

        msg = sep = ""
        for (i=1; i<NF; i+=2) {
            if ( $(i+1) != f1[key,$i] ) {
                msg = msg sep OFS ARGV[1] "." $i "=" f1[key,$i] OFS FILENAME "." $i "=" $(i+1)
                sep = ","
            }
        }
        if ( msg != "" ) {
            print "Mismatch for key " key msg
        }
        delete file1[key]
    }
    else {
        file2[key] = $0
    }
}
END {
    for (key in file1) {
        print "In file1 only:", key, file1[key]
    }
    for (key in file2) {
        print "In file2 only:", key, file2[key]
    }
}

$ awk -f tst.awk file1 file2
Mismatch for key issuer_grid_id="2" match_key="PLCHS252SA20" file1.book_base_ent_cd="U0028" file2.book_base_ent_cd="U0027", file1.unit_measure="USD" file2.unit_measure="EUR"
In file1 only: issuer_grid_id="2" match_key="PLCHS252SA22" period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="2" match_key="PLCHS252SA22"
In file2 only: issuer_grid_id="2" match_key="PLCHS252SA23" period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="2" match_key="PLCHS252SA23"


谢谢Ed。如果文件中一行的字段数量增加,会有影响吗?我刚刚更改了脚本的输入(可以在文件3中看到)。我将文件3的内容复制到文件1和文件2中。因此,现在两个文件是相同的,unix diff命令也证实了这一点。但是awk比较显示有差异。你能否请你尝试一下?奇怪的是它会将一个值作为输出中的字段显示出来。我认为我们需要在脚本中处理输入字段值中的空格。 - Neha
字段数量不重要,但现在您正在将空格引入引号字段中,并且我们在代码中使用空格和=作为字段分隔符。所以正如我在第一篇答案的开头所说的那样,假设您提供的示例中的引用字符串不包含空格、=或",即代码并不设计来处理这些内容。请就如何处理带引号字段中的空格和/或=和/或"提出一个新问题,而不是关于如何处理多字段键的问题。 - Ed Morton
当然,我会问另一个问题。但是为了我的理解,这个输入与原始输入有什么不同?在其中,我们也有像intra_group_acc_scope、frbrnc_stts、dflt_stts_issr等带有引号字符串内部空格的字段。 - Neha
如果“原始输入”是指您之前的问题,那么代码所处理的问题中没有带有空格的字段。如果您指的是此问题中的输入,则您的输入太长了,无法注意到这样的更改,您需要提前通知以进行查找。良好的示例输入/输出简洁明了,不需要滚动条即可查看,正因为如此。因此,您在此问题中的file3输入与file1和file2输入并没有区别,但您的问题是关于如何修改现有代码以处理多字段键,而不是如何处理带有空格的字段。 - Ed Morton

0

你可以使用 Ruby 集合:

$ cat tst.rb
def f2h(fn) 
    data={}
    File.open(fn){|fh| fh.
        each_line{|line| h=line.scan(/(\w+)="([^"]+)"/).to_h
            k=h.slice("issuer_grid_id", "match_key").
            map{|k,v| "#{k}=#{v}"}.join(", ")
            data[k]=h}
    }
    data
end

f1=f2h(ARGV[0])
f2=f2h(ARGV[1])

mis=Hash.new { |hash, key| hash[key] = [] }
(f2.keys & f1.keys).each{|k| 
    f1[k].each{|ks,v| 
        template="#{ks}: #{ARGV[0]}.#{f1[k][ks]}, #{ARGV[1]}.#{f2[k][ks]}"
        mis[k] << template if f1[k][ks]!=f2[k][ks]}} 

mis.each{|k,v| puts "Mismatch for key #{k} #{v.join(" ")}"}

f1only=(f1.keys-f2.keys).join(", ") 
f2only=(f2.keys-f1.keys).join(", ")  
puts "Only in #{ARGV[0]}: #{f1only}\nOnly in #{ARGV[1]}: #{f2only}"

然后这样调用:

ruby tst.rb  file1 file2

输出:

Mismatch for key issuer_grid_id=2, match_key=PLCHS252SA20 book_base_ent_cd: file1.U0028, file2.U0027 unit_measure: file1.USD, file2.EUR
Only in file1: issuer_grid_id=2, match_key=PLCHS252SA22
Only in file2: issuer_grid_id=2, match_key=PLCHS252SA23

(如果您想在值周围添加引号,这很容易实现。)

它之所以有效,是因为Ruby支持对数组进行集合运算(这是来自Ruby交互式shell的内容):

irb(main):033:0> arr1=[1,2,3,4]
=> [1, 2, 3, 4]
irb(main):034:0> arr2=[2,3,4,5]
=> [2, 3, 4, 5]
irb(main):035:0> arr1-arr2
=> [1]       # only in arr1
irb(main):036:0> arr2-arr1
=> [5]       # only in arr2
irb(main):037:0> arr1 & arr2
=> [2, 3, 4] # common between arr1 and arr2

因为我们使用了(f2.keys & f1.keys),我们保证只循环共享的键。所以它可以很好地处理你的例子file3

$ ruby tst.rb  file1 file3
Only in file1: issuer_grid_id=2, match_key=PLCHS252SA20, issuer_grid_id=3, match_key=PLCHS252SA20, issuer_grid_id=2, match_key=PLCHS252SA22, issuer_grid_id=2, match_key=PLCHS252SA21
Only in file3: issuer_grid_id=38677608, match_key=PLCHS258Q463

由于Python也有集合,因此这在Python中也很容易编写:

import re 

def f2h(fn):
    di={}
    k1, k2="issuer_grid_id", "match_key"
    with open(fn) as f:
        for line in f:
            matches=dict(re.findall(r'(\w+)="([^"]+)"', line))
            di[f"{k1}={matches[k1]}, {k2}={matches[k2]}"]=matches
    return di

f1=f2h(fn1)
f2=f2h(fn2)

mis={}
for k in set(f1.keys()) & set(f2.keys()):
    for ks,v in f1[k].items():
        if f1[k][ks]!=f2[k][ks]:
            mis.setdefault(k, []).append(
               f"{ks}: {fn1}.{f1[k][ks]}, {fn2}.{f2[k][ks]}")
            
for k,v in mis.items():
    print(f"Mismatch for key {k} {' '.join(v)}")
    
print(f"Only in {fn1}: {';'.join(set(f1.keys())-f2.keys())}")
print(f"Only in {fn2}: {';'.join(set(f2.keys())-f1.keys())}")

虽然 awk 不支持集合,但是使用关联数组可以轻松编写集合操作andminus。从而允许使用 GNU awk 版本的相同方法:
function set_and(a1, a2, a3) {
    delete a3
    for (e in a1) if (e in a2) a3[e]
}

function set_minus(a1, a2, a3) {
    delete a3
    for (e in a1) if (!(e in a2)) a3[e]
}

function proc_line(s, data) {
    delete data
    # this is the only GNU specific portion and easily rewritten for POSIX
    patsplit(s,matches,/\w+="[^"]+"/)
    for (m in matches) {
        split(matches[m],kv, /=/)
        data[kv[1]]=kv[2]
    }   
}

{
    proc_line($0, data) 
    key=sprintf("issuer_grid_id=%s, match_key=%s", 
        data["issuer_grid_id"], data["match_key"])
}

FNR==NR{a1[key]=$0}
FNR<NR{a2[key]=$0}

END{
    set_and(a1,a2, a3) 
    for (key in a3) {
        ft=sprintf("Mismatch for key %s ", key)
        proc_line(a1[key],d1)
        proc_line(a2[key],d2)
        for (sk in d1) if (d1[sk]!=d2[sk]) {
                printf("%s %s %s.%s; %s.%s", ft, sk, ARGV[1], d1[sk], ARGV[2], d2[sk])
                ft=""
                }
        if (ft=="") print ""
    }

    set_minus(a1,a2, a3)
    for (e in a3) printf("In %s only: %s\n", ARGV[1], e)

    set_minus(a2,a1, a3)
    for (e in a3) printf("In %s only: %s\n", ARGV[2], e)
}

这与 Ruby 和 Python 版本的工作方式相同,也支持第三个文件示例。

祝你好运!


网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接