我希望从一个包含大量数据的对象(包含嵌套集合)中生成XML文件。
但是XML有一个限制,它不能超过50MB。
有没有好的方法来解决这个问题呢?
更新:速度不重要,主要是将文件拆分为每个50MB。
XmlWriter
编写代码,并监控底层流。当它超过我的文件大小限制时,我完成当前的 Xml 片段,保存文件并关闭流。// arbitrary limit of 10MB
long FileSizeLimit = 10*1024*1024;
// open file stream to monitor file size
using (FileStream file = new FileStream("some.data.xml", FileMode.Create))
using (XmlWriter writer = XmlWriter.Create(file))
{
writer.WriteStartElement("root");
// while not greater than FileSizeLimit
for (; file.Length < FileSizeLimit; )
{
// write contents
writer.WriteElementString(
"data",
string.Format("{0}/{0}/{0}/{0}/{0}", Guid.NewGuid()));
}
// complete fragment; this is the trickiest part,
// since a complex document may have an arbitrarily
// long tail, and cannot be known during file size
// sampling above
writer.WriteEndElement();
writer.Flush();
}
// iteratively reduce document size
// NOTE: XDocument will load full DOM into memory
XDocument document = XDocument.Load("some.data.xml");
XElement root = document.Element("root");
for (; new FileInfo("some.data.xml").Length > FileSizeLimit; )
{
root.LastNode.Remove();
document.Save("some.data.xml");
}
// performs a shallow copy of a given node. courtesy of Mark Fussell
// http://blogs.msdn.com/b/mfussell/archive/2005/02/12/371546.aspx
public static void WriteShallowNode(this XmlWriter writer, XmlReader reader)
{
switch (reader.NodeType)
{
case XmlNodeType.Element:
writer.WriteStartElement(
reader.Prefix,
reader.LocalName,
reader.NamespaceURI);
writer.WriteAttributes(reader, true);
if (reader.IsEmptyElement)
{
writer.WriteEndElement();
}
break;
case XmlNodeType.Text: writer.WriteString(reader.Value); break;
case XmlNodeType.Whitespace:
case XmlNodeType.SignificantWhitespace:
writer.WriteWhitespace(reader.Value);
break;
case XmlNodeType.CDATA: writer.WriteCData(reader.Value); break;
case XmlNodeType.EntityReference:
writer.WriteEntityRef(reader.Name);
break;
case XmlNodeType.XmlDeclaration:
case XmlNodeType.ProcessingInstruction:
writer.WriteProcessingInstruction(reader.Name, reader.Value);
break;
case XmlNodeType.DocumentType:
writer.WriteDocType(
reader.Name,
reader.GetAttribute("PUBLIC"),
reader.GetAttribute("SYSTEM"),
reader.Value);
break;
case XmlNodeType.Comment: writer.WriteComment(reader.Value); break;
case XmlNodeType.EndElement: writer.WriteFullEndElement(); break;
}
}
还需要一个方法来执行修剪 (不是扩展方法,因为扩展任何参数类型都有点模棱两可)。
// trims xml file to specified file size. does so by
// counting number of "victim candidates" and then iteratively
// trimming these candidates one at a time until resultant
// file size is just less than desired limit. does not
// consider nested victim candidates.
public static void TrimXmlFile(string filename, long size, string trimNodeName)
{
long fileSize = new FileInfo(filename).Length;
long workNodeCount = 0;
// count number of victim elements in xml
if (fileSize > size)
{
XmlReader countReader = XmlReader.Create(filename);
for (; countReader.Read(); )
{
if (countReader.NodeType == XmlNodeType.Element &&
countReader.Name == trimNodeName)
{
workNodeCount++;
countReader.Skip();
}
}
countReader.Close();
}
// if greater than desired file size, and there is at least
// one victim candidate
string workFilename = filename+".work";
for (;
fileSize > size && workNodeCount > 0;
fileSize = new FileInfo(filename).Length)
{
workNodeCount--;
using (FileStream readFile = new FileStream(filename, FileMode.Open))
using (FileStream writeFile = new FileStream(
workFilename,
FileMode.Create))
{
XmlReader reader = XmlReader.Create(readFile);
XmlWriter writer = XmlWriter.Create(writeFile);
long j = 0;
bool hasAlreadyRead = false;
for (; (hasAlreadyRead) || reader.Read(); )
{
// if node is a victim node
if (reader.NodeType == XmlNodeType.Element &&
reader.Name == trimNodeName)
{
// if we have not surpassed this iteration's
// allowance, preserve node
if (j < workNodeCount)
{
writer.WriteNode(reader, true);
}
j++;
// if we have exceeded this iteration's
// allowance, trim node (and whitespace)
if (j >= workNodeCount)
{
reader.ReadToNextSibling(trimNodeName);
}
hasAlreadyRead = true;
}
else
{
// some other xml content we should preserve
writer.WriteShallowNode(reader);
hasAlreadyRead = false;
}
}
writer.Flush();
}
File.Copy(workFilename, filename, true);
}
File.Delete(workFilename);
}
如果您的Xml包含空格格式化,最后一个受害者节点与关闭容器元素标记之间的任何空格都会丢失。可以通过更改跳过子句(将j ++
语句移到跳过后面),但这样会导致额外的空格。上面提供的解决方案生成源文件的最小文件大小副本。
您可以使用 XmlWriter 或 XDocument 轻松编写大型xml文件,无需担心任何问题。
以下是一个示例。此示例在不到5秒钟的时间内生成了一个63MB的xml文件。对于此示例,我使用了类 XmlWriter。
using (XmlWriter writer = XmlWriter.Create("YourFilePath"))
{
writer.WriteStartDocument();
writer.WriteStartElement("Root");
for (int i = 0; i < 1000000; i++) //Write one million nodes.
{
writer.WriteStartElement("Root");
writer.WriteAttributeString("value", "Value #" + i.ToString());
writer.WriteString("Inner Text #" + i.ToString());
writer.WriteEndElement();
}
writer.WriteEndElement();
writer.WriteEndDocument();
}
你有没有考虑过将XML文件写成字符串,而不是使用.NET中的XML支持?
我曾经将大约10GB的数据写入XML,因为这是唯一一个工具可以使用的方式。
我也遇到过类似的问题,但我的XML非常简单,所以我只是使用了TextWriter和嵌套的for循环来编写XML。
效果很好,而且比XML对象快得多。