我想比较两个文件,并显示两个文件中的差异和缺失记录。
根据这个论坛上的建议,我发现awk是最快的解决方案。
基于组合键 - match_key 和 issuer_grid_id 进行比较。
代码:
BEGIN { FS="[= ]" }
{
match(" "$0,/ match_key="[^"]+"/)
key = substr($0,RSTART,RLENGTH)
}
NR==FNR {
file1[key] = $0
next
}
{
if ( key in file1 ) {
nf = split(file1[key],tmp)
for (i=1; i<nf; i+=2) {
f1[key,tmp[i]] = tmp[i+1]
}
msg = sep = ""
for (i=1; i<NF; i+=2) {
if ( $(i+1) != f1[key,$i] ) {
msg = msg sep OFS ARGV[1] "." $i "=" f1[key,$i] OFS FILENAME "." $i "=" $(i+1)
sep = ","
}
}
if ( msg != "" ) {
print "Mismatch in row " FNR msg
}
delete file1[key]
}
else {
file2[key] = $0
}
}
END {
for (key in file1) {
print "In file1 only:", key, file1[key]
}
for (key in file2) {
print "In file2 only:", key, file2[key]
}
}
文件1:
period="2021-02-28" book_base_ent_cd="U0028" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="USD" issuer_grid_id="2" match_key="PLCHS252SA20"
period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="3" match_key="PLCHS252SA20"
period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="2" match_key="PLCHS252SA22"
period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="2" match_key="PLCHS252SA21"
文件2:
period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="3" match_key="PLCHS252SA20"
period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="2" match_key="PLCHS252SA20"
period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="2" match_key="PLCHS252SA23"
period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="2" match_key="PLCHS252SA21"
文件3(它只有一行,但字段数量更多)
period="2021-02-28" book_base_ent_cd="U0027" other_inst_ident="PLCHS258Q463" rep_nom_curr="PLN" reporting_basis="Unit" src_instr_class="Debt" mat_date="2026-08-25" nom_curr="PLN" primary_asset_class="Bond" seniority_type="931" security_status="alive" issuer_name="CUST38677608" intra_group_prud_scope="Issuer is not part of the reporting group" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_frbrnc_stts="NOFRBRNRNGT" prfrmng_stts="Performing" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" src_imprmnt_assssmnt_mthd="COLLECTIVE" accmltd_imprmnt="78.54" accmltd_chngs_fv_cr="0" expsr_vl="0" unit_measure="EUR" unit_measure_nv="EUR" crryng_amnt="24565.13" issuer_grid_id="38677608" match_key="PLCHS258Q463"
期望输出:
In file1 only : issuer_grid_id="2" match_key="PLCHS252SA22"
In file2 only : issuer_grid_id="2" match_key="PLCHS252SA23"
Mismatch for issuer_grid_id="2" match_key="PLCHS252SA20" : file1.book_base_ent_cd="U0028" file2.book_base_ent_cd="U0027", file1.unit_measure="USD" file2.unit_measure="EUR"
实际输出
awk -f compare.awk file1 file2
Mismatch in row 1 for file1.issuer_grid_id="2" file2.issuer_grid_id="3", file1.match_key="PLCHS252SA21" file2.match_key="PLCHS252SA20"
In file2 only: period="2021-02-28" book_base_ent_cd="U0027" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="EUR" issuer_grid_id="2" match_key="PLCHS252SA21"
我找不到进行多字段比较的方法?
如有任何建议,将不胜感激。我还标记了Python,如果有更快的方法,请告知。
最好的问候。
=
作为字段分隔符。所以正如我在第一篇答案的开头所说的那样,假设您提供的示例中的引用字符串不包含空格、=或",即代码并不设计来处理这些内容。请就如何处理带引号字段中的空格和/或=
和/或"
提出一个新问题,而不是关于如何处理多字段键的问题。 - Ed Morton