如何在加载到XDocument时解析实体?

9
我正在尝试将一个XHTML文档加载到XDocument中,但是收到了“引用未声明的实体”的异常。我需要解析像 &reg;、&raquo; 这样的实体。我相信我的文档格式正确,以下是头部内容:
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">

当我执行XDocument.Load(<StringReader>)时,就会抛出这些异常。

2个回答

10

以下代码综合了 MSDN 文档和一篇博客文章中的做法。

        // Holds the parsed XHTML document; assigned inside the using block below.
        XDocument document;

        using (var stringReader = new StringReader(output))
        {
            var settings = new XmlReaderSettings
            {
                // DTD processing must be enabled so the entity declarations in the XHTML DTD
                // are read. DtdProcessing.Parse replaces the obsolete "ProhibitDtd = false".
                DtdProcessing = DtdProcessing.Parse,

                // The custom resolver maps the DOCTYPE public identifier to a DTD location;
                // the "CacheDTDs" app setting toggles its HTTP caching.
                XmlResolver = new LocalXhtmlXmlResolver(bool.Parse(ConfigurationManager.AppSettings["CacheDTDs"]))
            };

            // XmlReader is IDisposable as well, so it gets its own using block
            // instead of being created inline and never released.
            using (var xmlReader = XmlReader.Create(stringReader, settings))
            {
                document = XDocument.Load(xmlReader);
            }
        }

    /// <summary>
    /// An <see cref="XmlUrlResolver"/> that maps well-known XHTML public identifiers to
    /// their DTD URLs and, when enabled, downloads HTTP resources through the default
    /// HTTP request cache policy.
    /// </summary>
    private class LocalXhtmlXmlResolver : XmlUrlResolver
    {
        // Maps DOCTYPE public identifiers to the absolute URL of the matching DTD.
        private static readonly Dictionary<string, Uri> KnownUris = new Dictionary<string, Uri>
        {
            { "-//W3C//DTD XHTML 1.0 Strict//EN", new Uri("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd") },
            { "-//W3C XHTML 1.0 Transitional//EN", new Uri("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd") },
            { "-//W3C//DTD XHTML 1.0 Transitional//EN", new Uri("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd") },
            { "-//W3C XHTML 1.0 Frameset//EN", new Uri("http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd") },
            { "-//W3C//DTD XHTML 1.1//EN", new Uri("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd") }
        };

        private readonly bool enableHttpCaching;
        private ICredentials credentials;

        /// <summary>
        /// Creates the resolver.
        /// </summary>
        /// <param name="enableHttpCaching">
        /// When true, "http" resources requested as a stream are fetched through a
        /// <see cref="WebRequest"/> that uses <see cref="HttpRequestCacheLevel.Default"/>.
        /// </param>
        public LocalXhtmlXmlResolver(bool enableHttpCaching)
        {
            this.enableHttpCaching = enableHttpCaching;
        }

        /// <summary>
        /// Sets the credentials used for web requests. The value is stored locally so the
        /// cached request path in <see cref="GetEntity"/> can apply it, and is also passed
        /// on to the base resolver for the non-cached path.
        /// </summary>
        public override ICredentials Credentials
        {
            set
            {
                // Without this override the private field was never assigned, so the
                // credentials check in GetEntity could never succeed.
                this.credentials = value;
                base.Credentials = value;
            }
        }

        /// <summary>
        /// Returns the DTD URL registered for a known public identifier; any other value
        /// is resolved by the base class.
        /// </summary>
        /// <param name="baseUri">Base URI used by the base class for relative resolution.</param>
        /// <param name="relativeUri">Public identifier or URI to resolve; may be null.</param>
        /// <returns>The resolved URI.</returns>
        public override Uri ResolveUri(Uri baseUri, string relativeUri)
        {
            // A null key would make the dictionary lookup throw; let the base class decide
            // how to treat it instead.
            if (relativeUri == null)
            {
                return base.ResolveUri(baseUri, relativeUri);
            }

            Uri knownUri;
            if (KnownUris.TryGetValue(relativeUri, out knownUri))
            {
                return knownUri;
            }

            Debug.WriteLine("Could not find: " + relativeUri);

            return base.ResolveUri(baseUri, relativeUri);
        }

        /// <summary>
        /// Opens the resource at <paramref name="absoluteUri"/>. "http" resources requested
        /// as a stream go through the HTTP cache when caching is enabled; everything else
        /// uses the default <see cref="XmlUrlResolver"/> behavior.
        /// </summary>
        /// <param name="absoluteUri">Absolute URI of the resource to open.</param>
        /// <param name="role">Passed through to the base class; not used by the cached path.</param>
        /// <param name="ofObjectToReturn">Requested return type; null or <see cref="Stream"/> for the cached path.</param>
        /// <returns>The response stream for the cached path, otherwise whatever the base class returns.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="absoluteUri"/> is null.</exception>
        public override object GetEntity(Uri absoluteUri, string role, Type ofObjectToReturn)
        {
            if (absoluteUri == null)
            {
                throw new ArgumentNullException("absoluteUri");
            }

            //resolve resources from cache (if possible)
            if (absoluteUri.Scheme == "http" && this.enableHttpCaching && (ofObjectToReturn == null || ofObjectToReturn == typeof(Stream)))
            {
                var request = WebRequest.Create(absoluteUri);

                request.CachePolicy = new HttpRequestCachePolicy(HttpRequestCacheLevel.Default);

                if (this.credentials != null)
                {
                    request.Credentials = this.credentials;
                }

                // The response is intentionally not disposed here: the returned stream
                // belongs to it and is handed to the caller.
                var response = request.GetResponse();

                return response.GetResponseStream();
            }

            //otherwise use the default behavior of the XmlUrlResolver class (resolve resources from source)
            return base.GetEntity(absoluteUri, role, ofObjectToReturn);
        }
    }

8
从Web解析DTD通常是不明智的做法——除了会不必要地向W3C服务器发送请求之外,速度也相当慢,而且依赖于可用且可靠的Internet连接。更好的方法是把这些DTD的本地副本作为内嵌资源存储,并通过Assembly.GetManifestResourceStream加载;或者把它们作为本地文件放在可执行文件所在的目录中。 - Pavel Minaev

9
我和 Dave 遇到了相同的问题,然后找到了这个问题并得到了很大帮助。基于 Dave 的答案和 Pavel 的优化建议,我更新了这个类。现在 DTDs 可以作为内嵌资源存储,并在必要时加载。我知道这篇文章已经几年了,但也许对某个人会有所帮助。 用法示例:
// Enable DTD parsing and plug in the resolver that supplies the XHTML DTDs.
var readerSettings = new XmlReaderSettings();
readerSettings.DtdProcessing = DtdProcessing.Parse;
readerSettings.XmlResolver = new LocalXhtmlXmlResolver();

// Load the XHTML stream through a reader configured with those settings.
using (var reader = XmlReader.Create(xhtmlStream, readerSettings))
{
    var xhtml = XDocument.Load(reader);
    ...
}

LocalXhtmlXmlResolver类:
该类用于从程序集的内嵌资源中解析XHTML的DTD和实体文件,而不是从网络下载它们。
/// <summary>
/// An <see cref="XmlUrlResolver"/> that serves the XHTML DTDs and entity files from
/// resources embedded in this assembly. Anything it does not know is resolved by the
/// base class.
/// </summary>
public class LocalXhtmlXmlResolver : XmlUrlResolver
{
    // Prefix prepended to every embedded resource name; replace with the real namespace.
    private const string ResourcePrefix = "Your.Namespace.Here.";

    // Maps an absolute URI string to the embedded resource that holds its content.
    // NOTE(review): the "ENTITIES" keys are presumably what the base class produces when
    // it combines an entity set's public identifier with the DTD's base URI - confirm.
    private static readonly Dictionary<string, string> _knownDtds = new Dictionary<string, string>
        {
            { "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd", ResourcePrefix + "xhtml1-strict.dtd" },
            { "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd", ResourcePrefix + "xhtml1-transitional.dtd" },
            { "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd", ResourcePrefix + "xhtml1-frameset.dtd" },
            { "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd", ResourcePrefix + "xhtml11.dtd" },
            { "http://www.w3.org/TR/xhtml1/DTD/-//W3C//ENTITIES Latin 1 for XHTML//EN", ResourcePrefix + "xhtml-lat1.ent" },
            { "http://www.w3.org/TR/xhtml1/DTD/-//W3C//ENTITIES Special for XHTML//EN", ResourcePrefix + "xhtml-special.ent" },
            { "http://www.w3.org/TR/xhtml1/DTD/-//W3C//ENTITIES Symbols for XHTML//EN", ResourcePrefix + "xhtml-symbol.ent" }
        };

    // Maps DOCTYPE public identifiers to the absolute URL of the matching DTD.
    private static readonly Dictionary<string, Uri> _knownUris = new Dictionary<string, Uri>
        {
            { "-//W3C//DTD XHTML 1.0 Strict//EN", new Uri("http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd") },
            { "-//W3C XHTML 1.0 Transitional//EN", new Uri("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd") },
            { "-//W3C//DTD XHTML 1.0 Transitional//EN", new Uri("http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd") },
            { "-//W3C XHTML 1.0 Frameset//EN", new Uri("http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd") },
            { "-//W3C//DTD XHTML 1.1//EN", new Uri("http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd") }
        };

    /// <summary>
    /// Returns the DTD URL registered for a known public identifier; any other value is
    /// resolved by the base class.
    /// </summary>
    /// <param name="baseUri">Base URI used by the base class for relative resolution.</param>
    /// <param name="relativeUri">Public identifier or URI to resolve; may be null.</param>
    /// <returns>The resolved URI.</returns>
    public override Uri ResolveUri(Uri baseUri, string relativeUri)
    {
        // A null key would make the dictionary lookup throw; defer to the base class,
        // which defines how a missing relative URI is handled.
        if (relativeUri == null)
        {
            return base.ResolveUri(baseUri, relativeUri);
        }

        Uri knownUri;
        return _knownUris.TryGetValue(relativeUri, out knownUri)
            ? knownUri
            : base.ResolveUri(baseUri, relativeUri);
    }

    /// <summary>
    /// Opens the embedded resource registered for <paramref name="absoluteUri"/>, or
    /// delegates to the base class when the URI is not registered.
    /// </summary>
    /// <param name="absoluteUri">Absolute URI of the resource to open.</param>
    /// <param name="role">Passed through to the base class.</param>
    /// <param name="ofObjectToReturn">Passed through to the base class.</param>
    /// <returns>
    /// The manifest resource stream for a registered URI (null when no resource with that
    /// name is embedded in the assembly); otherwise whatever the base class returns.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="absoluteUri"/> is null.</exception>
    public override object GetEntity(Uri absoluteUri, string role, Type ofObjectToReturn)
    {
        if (absoluteUri == null)
        {
            throw new ArgumentNullException("absoluteUri");
        }

        // The lookup is keyed on the URI text exactly as it was supplied (OriginalString).
        string resourceName;
        if (_knownDtds.TryGetValue(absoluteUri.OriginalString, out resourceName))
        {
            Assembly assembly = Assembly.GetAssembly(typeof(LocalXhtmlXmlResolver));
            return assembly.GetManifestResourceStream(resourceName);
        }

        return base.GetEntity(absoluteUri, role, ofObjectToReturn);
    }
}

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接