我重新表述一下我的问题,使其更加清晰。我的数据如下:
{
"Research": {
"@xmlns": "http://www.xml.org/2013/2/XML",
"@language": "eng",
"@createDateTime": "2022-03-25T10:12:39Z",
"@researchID": "abcd",
"Product": {
"@productID": "abcd",
"StatusInfo": {
"@currentStatusIndicator": "Yes",
"@statusDateTime": "2022-03-25T12:18:41Z",
"@statusType": "Published"
},
"Source": {
"Organization": {
"@primaryIndicator": "Yes",
"@type": "SellSideFirm",
"OrganizationID": [
{
"@idType": "L1",
"#text": "D827C98E315F"
},
{
"@idType": "TR",
"#text": "3202"
},
{
"@idType": "TR",
"#text": "SZA"
}
],
"OrganizationName": {
"@nameType": "Legal",
"#text": "Citi"
},
"PersonGroup": {
"PersonGroupMember": {
"@primaryIndicator": "Yes",
"@sequence": "1",
"Person": {
"@personID": "tr56",
"FamilyName": "Wang",
"GivenName": "Bond",
"DisplayName": "Bond Wang",
"Biography": "Bond Wang is a",
"BiographyFormatted": "Bond Wang",
"PhotoResourceIdRef": "AS44556"
}
}
}
}
},
"Content": {
"Title": "Premier",
"Abstract": "None",
"Synopsis": "Premier’s solid 1H22 result .",
"Resource": [
{
"@language": "eng",
"@primaryIndicator": "Yes",
"@resourceID": "9553",
"Length": {
"@lengthUnit": "Pages",
"#text": "17"
},
"MIMEType": "text/html",
"URL": "https://www.DFKJG.com/rendition/eppublic"
},
{
"@language": "eng",
"@primaryIndicator": "No",
"@resourceID": "4809",
"Length": {
"@lengthUnit": "Pages",
"#text": "17"
},
"MIMEType": "ABS/pdf",
"Name": "asdf.pdf",
"Comments": "fr5.pdf"
},
{
"@language": "eng",
"@primaryIndicator": "No",
"@resourceID": "6d13a965723e",
"Length": {
"@lengthUnit": "Pages",
"#text": "17"
},
"MIMEType": "text/html",
"URL": "https://www.dfgdfg.com/"
},
{
"@primaryIndicator": "No",
"@resourceID": "709c7bdb1c99",
"MIMEType": "tyy/image",
"URL": "https://ir.ght.com"
},
{
"@primaryIndicator": "No",
"@resourceID": "gfjhgj",
"MIMEType": "gtty/image",
"URL": "https://ir.gtty.com"
}
]
},
"Context": {
"@external": "Yes",
"IssuerDetails": {
"Issuer": {
"@issuerType": "Corporate",
"@primaryIndicator": "Yes",
"SecurityDetails": {
"Security": {
"@estimateAction": "Revision",
"@primaryIndicator": "Yes",
"@targetPriceAction": "Increase",
"SecurityID": [
{
"@idType": "RIC",
"@idValue": "PMV.AX",
"@publisherDefinedValue": "RIC"
},
{
"@idType": "Bloomberg",
"@idValue": "PMV@AU"
},
{
"@idType": "SEDOL",
"@idValue": "6699781"
}
],
"SecurityName": "Premier Investments Ltd",
"AssetClass": {
"@assetClass": "Equity"
},
"AssetType": {
"@assetType": "Stock"
},
"SecurityType": {
"@securityType": "Common"
},
"Rating": {
"@rating": "NeutralSentiment",
"@ratingType": "Rating",
"@aspect": "Investment",
"@ratingDateTime": "2020-07-31T08:24:37Z",
"RatingEntity": {
"@ratingEntity": "PublisherDefined",
"PublisherDefinedValue": "Citi"
}
}
}
},
"IssuerID": {
"@idType": "PublisherDefined",
"@idValue": "PMV.AX",
"@publisherDefinedValue": "TICKER"
},
"IssuerName": {
"@nameType": "Legal",
"NameValue": "Premier Investments Ltd"
}
}
},
"ProductDetails": {
"@periodicalIndicator": "No",
"@publicationDateTime": "2022-03-25T12:18:41Z",
"ProductCategory": {
"@productCategory": "Report"
},
"ProductFocus": {
"@focus": "Issuer",
"@primaryIndicator": "Yes"
},
"EntitlementGroup": {
"Entitlement": [
{
"@includeExcludeIndicator": "Include",
"@primaryIndicator": "No",
"AudienceTypeEntitlement": {
"@audienceType": "PublisherDefined",
"@entitlementContext": "TR",
"#text": "20012"
}
},
{
"@includeExcludeIndicator": "Include",
"@primaryIndicator": "No",
"AudienceTypeEntitlement": {
"@audienceType": "PublisherDefined",
"@entitlementContext": "TR",
"#text": "2001"
}
}
]
}
},
"ProductClassifications": {
"Discipline": {
"@disciplineType": "Investment",
"@researchApproach": "Fundamental"
},
"Subject": {
"@publisherDefinedValue": "TREPS",
"@subjectValue": "PublisherDefined"
},
"Country": {
"@code": "AU",
"@primaryIndicator": "Yes"
},
"Region": {
"@primaryIndicator": "Yes",
"@emergingIndicator": "No",
"@regionType": "Australasia"
},
"AssetClass": {
"@assetClass": "Equity"
},
"AssetType": {
"@assetType": "Stock"
},
"SectorIndustry": [
{
"@classificationType": "GICS",
"@code": "25201040",
"@focusLevel": "Yes",
"@level": "4",
"@primaryIndicator": "Yes",
"Name": "Household Appliances"
},
{
"@classificationType": "GICS",
"@code": "25504020",
"@focusLevel": "Yes",
"@level": "4",
"@primaryIndicator": "Yes",
"Name": "Computer & Electronics Retail"
},
{
"@classificationType": "GICS",
"@code": "25504040",
"@focusLevel": "Yes",
"@level": "4",
"@primaryIndicator": "Yes",
"Name": "Specialty Stores"
},
{
"@classificationType": "GICS",
"@code": "25504030",
"@focusLevel": "Yes",
"@level": "4",
"@primaryIndicator": "Yes",
"Name": "Home Improvement Retail"
},
{
"@classificationType": "GICS",
"@code": "25201050",
"@focusLevel": "Yes",
"@level": "4",
"@primaryIndicator": "Yes",
"Name": "Housewares & Specialties"
}
]
}
}
}
}
}
我希望将其中的所有元素展开为一个数据框(DataFrame)。列数可能会发生变化,并且部分字段具有类似列表的结构。也就是说,我们无法事先知道下一份输入需要展开的列是更少还是更多。
以下是我目前尝试过的代码,但它似乎没有给出正确的结果。此外,我把需要展开的列名硬编码了,而程序本应自动识别这些列并将其展开。
import xmltodict as xmltodict
from pprint import pprint
import pandas as pd
import json
from tabulate import tabulate
# Parse the XML payload into a nested mapping.
# NOTE(review): """xml data""" looks like a placeholder for the real XML text - confirm.
# The result is bound to `parsed` rather than `dict` so the builtin `dict`
# is not shadowed for the rest of the module.
parsed = xmltodict.parse("""xml data""")

# Round-trip through JSON so that `resp` consists only of plain
# dict / list / str values as produced by json.loads.
json_str = json.dumps(parsed)
resp = json.loads(json_str)
print(resp)

# Flatten nested objects into dot-separated column names. List values are
# not expanded by json_normalize here; each list stays in a single cell.
df = pd.json_normalize(resp)

# Dotted paths of the list-valued columns that should be exploded into rows.
cols = [
    'Research.Product.Source.Organization.OrganizationID',
    'Research.Product.Content.Resource',
    'Research.Product.Context.IssuerDetails.Issuer.SecurityDetails.Security.SecurityID',
    'Research.Product.Context.ProductDetails.EntitlementGroup.Entitlement',
    'Research.Product.Context.ProductClassifications.SectorIndustry',
]
def expplode_columns(df, cols=None):
    """Explode list-valued columns of ``df`` into one row per list element.

    Columns are exploded one after another, so exploding several columns
    yields every combination of their elements (the row counts multiply).
    The exploded elements themselves are left as-is; they are not flattened.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified.
    cols : iterable of column labels, optional
        Columns to explode, in order. When omitted (``None``), every column
        that holds at least one ``list`` or ``tuple`` value is exploded, so
        callers do not have to hard-code the column names.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If a label in ``cols`` is not a column of ``df``.
    """
    df_e = df.copy()
    if cols is None:
        # Auto-detect the columns that contain list-like cells.
        cols = [
            name
            for name, series in df_e.items()
            if series.map(lambda v: isinstance(v, (list, tuple))).any()
        ]
    for c in cols:
        # ignore_index=True renumbers the rows after each explode step.
        df_e = df_e.explode(c, ignore_index=True)
    return df_e
# Explode the configured list columns into rows, then print the result
# as a psql-style table with the column labels as headers.
df2 = expplode_columns(df, cols)
print(
    tabulate(
        df2,
        tablefmt="psql",
        headers="keys",
    )
)
# Optional CSV export of the exploded frame (currently disabled).
# df2.to_csv('dataframe.csv', header=True, index=False)