在 Django 中生成的 RSS 订阅中插入未转义的 HTML。

5
我正在尝试使用Django创建播客RSS订阅,使用feedgenerator.Rss201rev2Feed。作为一个提供订阅的生成器,它的工作方式有点与BeautifulSoup相反:将信息放入适当的XML标记中。
它工作得很好,但我不想转义所有的HTML。
特别是,我希望rss订阅的<itunes:summary>值显示为这样:<itunes:summary><![CDATA[Link to <a href="http://www.website.com">the website</a>]]></itunes:summary>,根据Apple spec的规定。
如果我在普通视图中渲染html,我可以在html模板中使用|safe filter。现在我需要类似的东西,以有选择地防止rss feed中的<被转义。
也就是说,我需要rss显示为<![CDATA[...]]>,而不是被转义为&lt;![CDATA[...]]&gt;。然而,似乎Django“无论如何都会自动转义RSS feed(或任何XML),无论您是否通过safe过滤器传递它”(参见这个2009年的问题)。
到目前为止,尝试使用mark_safe都是无用的。
我也不确定该如何理解这样一个建议:在django.contrib.syndication.feeds中的render()调用里传递"autoescape=False"。
评论中还有人建议给addQuickElement加上escape=False参数,但这样做会报错:
 handler.addQuickElement(u'itunes:summary',item['summary'], escape=False)
 TypeError: addQuickElement() got an unexpected keyword argument 'escape'

这是一个旧问题,但目前我找不到任何解决方案。

有人知道一种简洁的方法,可以使<![CDATA[...原样出现在最终的feed中,而不是被转义为&lt;![CDATA[...吗?


编辑:这是我发布此问题时的代码(我尚未尝试将@Lego的答案合并)

import mimetypes

from django.conf import settings
from django.contrib.syndication.views import Feed

# For customising the feed
from django.utils.feedgenerator import Rss201rev2Feed
from django.utils import feedgenerator
# see also https://github.com/blancltd/blanc-basic-podcast/blob/master/blanc_basic_podcast/podcast/itunesfeed.py
# and https://github.com/aneumeier/feeds/blob/master/feeds/rss.py
# and https://github.com/blancltd/blanc-basic-podcast/blob/master/blanc_basic_podcast/podcast/feeds.py
# and https://docs.djangoproject.com/en/1.7/ref/contrib/syndication/#custom-feed-generators

from django.contrib.auth.models import User
from django.shortcuts import get_object_or_404
from django.utils.translation import ugettext_lazy as _
from django.contrib.sites.models import Site

from audiotracks.models import get_track_model, Playlist
Track = get_track_model()

ITEMS_PER_FEED = getattr(settings, 'AUDIOTRACKS_PODCAST_LIMIT', 99)
# MarkAdded @ToDo revisit that default maximum num. tracks per feed

from django.core.urlresolvers import reverse, reverse_lazy


from django_slack import slack_message




######################################################################
##### try adapting code from https://github.com/CaptainHayashi/django-lass-uryplayer/blob/master/uryplayer/feeds.py

from django.utils.feedgenerator import Rss201rev2Feed
from django.contrib.syndication.views import Feed
from django.contrib.sites.models import Site
from django.db.models import permalink
# from uryplayer.models import Podcast
import datetime
# MarkAdded in attempt to have un-escaped <![CDATA[...]]
from django.utils.safestring import mark_safe


# from https://dev59.com/DnVC5IYBdhLWcg3wfxQ8
try:
    from html.parser import HTMLParser  # py3
except ImportError:
    from HTMLParser import HTMLParser  # py2

# HTMLParser.unescape() was deprecated in Python 3.4 and removed in 3.9, so
# prefer the public html.unescape() and only fall back to the parser method
# on interpreters that do not provide it (Python 2 and 3.0-3.3).
try:
    from html import unescape
except ImportError:
    unescape = HTMLParser().unescape
# print(unescape("&gt;"))
# That proved useless so far



class iTunesPodcastsFeedGenerator(Rss201rev2Feed):
    """RSS 2.01 generator that adds the iTunes podcast extension elements.

    ``<itunes:summary>`` is written as a CDATA section so that HTML in the
    summary is not entity-escaped in the generated XML. The parsed text of
    the element is the same as with the escaped form; only the
    serialization differs.
    """

    def rss_attributes(self):
        """Return the ``<rss>`` attributes, declaring the Atom and iTunes namespaces."""
        return {u"version": self._version, u"xmlns:atom": u"http://www.w3.org/2005/Atom", u'xmlns:itunes': u'http://www.itunes.com/dtds/podcast-1.0.dtd'}

    @staticmethod
    def _add_cdata_element(handler, name, text):
        """Write ``<name><![CDATA[text]]></name>`` without escaping ``text``.

        ``handler.addQuickElement()`` sends its contents through
        ``characters()``, which escapes ``<``, ``>`` and ``&``.
        ``ignorableWhitespace()`` writes its argument verbatim, so it is used
        here to emit the CDATA markers and the raw text.
        """
        handler.startElement(name, {})
        if text:
            # A literal "]]>" would terminate the section early, so split it
            # across two adjacent CDATA sections.
            safe_text = text.replace(u']]>', u']]]]><![CDATA[>')
            handler.ignorableWhitespace(u'<![CDATA[%s]]>' % safe_text)
        handler.endElement(name)

    def add_root_elements(self, handler):
        """Write the standard channel elements followed by the iTunes channel elements."""
        super(iTunesPodcastsFeedGenerator, self).add_root_elements(handler)
        handler.addQuickElement(u'itunes:subtitle', self.feed['subtitle'])
        handler.addQuickElement(u'itunes:author', self.feed['author_name'])
        # mark_safe() has no effect on the XML generator, so it is not used here.
        self._add_cdata_element(handler, u'itunes:summary', unescape(self.feed['description']))

        # The artwork URL is carried in the href attribute; the element itself is empty.
        # that's from https://gitorious.org/podjango/podjango/commit/621857be0a3d7c44f1925c7daf471c38ea62c180?diffmode=sidebyside
        handler.addQuickElement('itunes:image', '', {'href': self.feed['iTunes_image_url']})

        handler.addQuickElement(u'itunes:explicit', self.feed['iTunes_explicit'])
        handler.startElement(u"itunes:owner", {})
        handler.addQuickElement(u'itunes:name', self.feed['iTunes_name'])
        handler.addQuickElement(u'itunes:email', self.feed['iTunes_email'])
        handler.endElement(u"itunes:owner")

        # @ToDo: add categories

    def add_item_elements(self, handler, item):
        """Write the standard item elements followed by the iTunes item elements."""
        super(iTunesPodcastsFeedGenerator, self).add_item_elements(handler, item)
        self._add_cdata_element(handler, u'itunes:summary', unescape(item['summary']))
        handler.addQuickElement(u'itunes:explicit', item['explicit'])
        # NOTE(review): this uses the feed-level artwork for every item, not
        # item['iTunes_image_url'] -- confirm that is intended.
        handler.addQuickElement(u'itunes:image', '', {'href': self.feed['iTunes_image_url']})


    # def __unicode__(self):
    #     return unicode(self.order_num)

class iTunesPodcastPost():
    """Feed-item wrapper around a track (the ``podcast`` argument is a Track).

    Copies the fields the feed needs onto plain attributes and substitutes
    placeholder values for anything the track leaves empty.
    """

    def __init__(self, podcast):
        self.id = podcast.id
        self.pub_date = podcast.pub_date
        self.title = podcast.title or "Track"

        # Both the summary and the description fall back to the same
        # placeholder; only the summary is HTML-unescaped.
        raw_description = podcast.description
        if raw_description:
            self.summary = unescape(raw_description)
            self.description = raw_description
        else:
            self.summary = "Cool thing"
            self.description = "Cool thing"

        self.enclosure_url = podcast.awe_url  # defined in models.py
        self.enclosure_length = podcast.size or 1
        self.enclosure_mime_type = u'audio/mpeg'  # @ToDo generalise once we have other types
        self.explicit = u'yes' if podcast.explicit else u'no'
        # Stores the bound method itself, not the URL it would return.
        self.url = podcast.get_absolute_url

        self.iTunes_image_url = podcast.main_image_url

        running_time = podcast.time_duration
        self.length = running_time or 11
        self.user_id = podcast.user_id
        self.user = User.objects.get(id=self.user_id)
        self.slug = podcast.slug
        self.duration = running_time or "5:00"

    def __unicode__(self):
        """Return the item's title."""
        return self.title

    def get_absolute_url(self):
        """Return the URL of the track detail page for this item."""
        url_args = [self.user.username, self.slug]
        return reverse('track_detail', args=url_args)


class iTunesPodcastsFeed(Feed):
    """
    A feed of podcasts for iTunes and other compatible podcatchers.

    Each feed represents one playlist; the feed items are that playlist's
    tracks wrapped in iTunesPodcastPost.
    Based on https://github.com/CaptainHayashi/django-lass-uryplayer/blob/master/uryplayer/feeds.py
    """

    def get_object(self, request, username, playlist_slug):
        """Return the playlist for ``username``/``playlist_slug`` via get_object_or_404."""
        self.request = request
        user = get_object_or_404(User, username=username)
        return get_object_or_404(Playlist, user_id=user.id, slug=playlist_slug)

    def link(self, playlist):
        """Return the URL of the playlist's index page."""
        user = User.objects.get(id=playlist.user_id)
        return reverse('playlist_index', args=[user.username, playlist.slug])

    def title(self, playlist):
        """Return the playlist title as the feed title."""
        return playlist.title

    def description(self, playlist):
        """Return the playlist description, or a placeholder when it is empty."""
        if playlist.description:
            # Not wrapped in CDATA here: the feed generator escapes "<", so
            # the wrapping has to happen where the element is written.
            return playlist.description
        else:
            return "[Auto text] The creator has not written a description."

    def iTunes_image_url(self, obj):
        """Return the playlist artwork URL as text, or a default icon URL."""
        if obj.main_image_url:
            # Coerced to text; passing the raw attribute raised
            # "TypeError: coercing to Unicode" further down the line.
            return unicode(obj.main_image_url)
        else:
            return u'https://dl.dropboxusercontent.com/u/16441973/publicstatic/img/playlist-icon.png'

    # modified from https://github.com/aneumeier/feeds/blob/master/feeds/rss.py
    def author_name(self, obj):  # obj = playlist
        """
        Return the author for this feed.
        The feed is in `obj`, provided by `get_object`
        """
        if obj.author:
            return u"%s" % obj.author
        else:
            return 'Playlist created by %s' % (obj.user.username)

    def subtitle(self, obj):  # obj = playlist
        """
        Return the subtitle for this feed.
        The feed is in `obj`, provided by `get_object`
        """
        if obj.subtitle:
            # Previously returned obj.author here (copy-paste from author_name).
            return u"%s" % obj.subtitle
        else:
            return '%s created in 2015' % (obj.title)

    # @ToDo: finish adapting rest of this from the hard-coded URY values to actual values for my implementation

    iTunes_name = u'Hard-coded iTunes name for now'
    iTunes_email = u'm@rkmoriarty.com'
    # @ToDo: make dynamic, not hard-coded

    iTunes_explicit = u'no'
    feed_type = iTunesPodcastsFeedGenerator
    # NOTE(review): evaluated once when the class body runs, so the year does
    # not advance in a long-running process; the holder name is still the
    # value inherited from the code this was adapted from.
    feed_copyright = "Copyright 1967-%s University Radio York" % datetime.date.today().year

    def feed_extra_kwargs(self, playlist):
        """Return the extra feed-level values read by the iTunes feed generator."""
        if playlist.main_image_url:
            image_url = playlist.main_image_url
        else:
            # @ToDo: replace with Awesound logo
            image_url = "https://dl.dropboxusercontent.com/u/16441973/publicstatic/img/rss_playlist_icon_placeholder.png"

        return {
            'iTunes_name': self.iTunes_name,
            'iTunes_email': self.iTunes_email,
            'iTunes_image_url': image_url,
            'iTunes_explicit': self.iTunes_explicit,
        }

    def items(self, playlist):
        """
        Returns a list of items to publish in this feed.

        Tracks are ordered newest first by publication date, with creation
        time as the tie-breaker, and capped at ITEMS_PER_FEED.
        """
        # A second .order_by() call replaces the first one, so both keys must
        # be given in a single call for the publication date to take effect.
        tracks = playlist.tracks.all().order_by('-pub_date', '-created_at')[:ITEMS_PER_FEED]
        return [iTunesPodcastPost(track) for track in tracks]

    def item_extra_kwargs(self, item):
        """Return the extra per-item values read by the iTunes feed generator."""
        return {
            'summary': unescape(mark_safe(item.description)),
            'explicit': item.explicit,
            'iTunes_image_url': item.iTunes_image_url,
        }

    def item_link(self, item):
        """Return the track detail URL, or a marker string for a track without a user."""
        if item.user_id:
            # we have a normal track created by a user
            return reverse('track_detail', args=[item.user.username, item.slug])
        else:
            # we have a funny track without a user, e.g., created via command line
            return 'Exception:TrackWithoutUser'

    def item_pubdate(self, item):
        """Return the item's publication date."""
        return item.pub_date

    def item_enclosure_url(self, item):
        """Return the URL of the item's audio file."""
        return item.enclosure_url

    def item_enclosure_length(self, item):
        """Return the value published as the enclosure length (``item.length``)."""
        return item.length

    def item_enclosure_mime_type(self, item):
        """Return the enclosure MIME type."""
        return 'audio/mpeg'  # @ToDo: make dynamic

    def item_description(self, item):
        """Return the HTML-unescaped item description, or a placeholder when it is empty."""
        if item.description:
            return unescape(mark_safe(item.description))
        else:
            # iTunesPodcastPost already substitutes its own placeholder for an
            # empty description, so this branch is a safety net only.
            return "User has not written a description. This is an automatic message"

# Hard-coded site URL; the Sites-framework lookup below is disabled.
# current_site = Site.objects.get_current()
current_site = 'https://greatsite.com'
# Module-level feed instance; choose_feed() dispatches playlist URLs to it.
iTunes_feed = iTunesPodcastsFeed()



### the above will be called if both username and playlist_slug are deteced in the url
### there are two older methods to handle other situations

class AllTracks(Feed):
    #
    # working old method, not relevant to html escaping question
    #


class UserTracks(AllTracks):
    #
    # working old method, not relevant to my question
    #

all_tracks = AllTracks()
user_tracks = UserTracks()

### note, both of those are also subject to full html escaping also



def choose_feed(request, *args, **kwargs):
    """
    Dispatch the request to the playlist, user or global feed.

    The playlist feed is used when the URL supplies both ``username`` and
    ``playlist_slug``, the user feed when it supplies only ``username``, and
    the global feed otherwise. A Slack notification is sent for playlist feed
    requests whose playlist can be found.
    """
    if 'username' not in kwargs:
        feed = all_tracks
    elif 'playlist_slug' not in kwargs:
        feed = user_tracks
    else:
        feed = iTunes_feed
        user = getattr(request, 'user', None)
        if user:
            # Look the playlist up by owner AND slug, as the feed itself does.
            # A slug-only .get() can raise DoesNotExist or
            # MultipleObjectsReturned and turn the request into a 500 just
            # for the sake of the notification.
            playlist = Playlist.objects.filter(
                user__username=kwargs['username'],
                slug=kwargs['playlist_slug'],
            ).first()
            if playlist is not None:
                slack_message('slackmessages/playlist_feed.slack', {  # django_slack/slackmessages/
                    'playlist': playlist,
                    'user': user,
                    })

    return feed(request, *args, **kwargs)

1
你能添加一些视图代码吗?我以前没有使用过Django的RSS功能,但我可能可以提供一些帮助。 - user764357
非常感谢。当然,我会编辑以在上面进行大量代码转储。我基本上只是复制了https://github.com/CaptainHayashi/django-lass-uryplayer/blob/master/uryplayer/feeds.py,并对其进行了调整以适应我的models.py的变量名称。我已经定义了“播放列表”(类似于播客系列,有一个feed),以及它的项目,即“Track”模型(“播客”剧集)。 - Mark
6个回答

2
您可以替换代码:
    # A CDATA section is terminated by "]]>", not "]]".
    contents = '<![CDATA[ contents ]]>'
    xml.addQuickElement('element', contents=contents)

使用:

    contents = 'contents'
    xml.startElement('element', {})
    # ignorableWhitespace() is the public XMLGenerator method that writes text
    # without escaping it; the section must be closed with "]]>".
    xml.ignorableWhitespace(f'<![CDATA[ {contents} ]]>')
    xml.endElement('element')

2
这仍然是谷歌上与此问题相关的最热门搜索结果,因此根据Nick的回答在这里,这里提供了完整的解答。
import re
from xml.sax.saxutils import XMLGenerator


class MySimplerXMLGenerator(XMLGenerator):
    """XMLGenerator variant that writes complete CDATA sections verbatim.

    Contents passed to ``addQuickElement()`` that start with ``<![CDATA[`` and
    end with ``]]>`` are written without escaping; everything else is escaped
    by ``characters()`` as usual.
    """

    # Control characters that cannot be represented in XML 1.0.
    _CONTROL_CHARS = re.compile(r'[\x00-\x08\x0B-\x0C\x0E-\x1F]')

    def addQuickElement(self, name, contents=None, attrs=None):
        "Convenience method for adding an element with no children"
        if attrs is None:
            attrs = {}
        self.startElement(name, attrs)
        if contents is not None:
            # Require the closing marker too: writing an unterminated section
            # raw would produce a malformed document.
            if contents.startswith('<![CDATA[') and contents.endswith(']]>'):
                self.unescaped_characters(contents)
            else:
                self.characters(contents)
        self.endElement(name)

    def _check_serializable(self, content):
        """Raise if ``content`` contains control characters."""
        if content and self._CONTROL_CHARS.search(content):
            # Local import: the exception class is only needed on this error path.
            from django.utils.xmlutils import UnserializableContentError
            # Fail loudly when content has control chars (unsupported in XML 1.0)
            # See https://www.w3.org/International/questions/qa-controls
            raise UnserializableContentError("Control characters are not supported in XML 1.0")

    def characters(self, content):
        """Write ``content`` as escaped character data."""
        self._check_serializable(content)
        XMLGenerator.characters(self, content)

    def unescaped_characters(self, content):
        """Write ``content`` verbatim, without XML escaping."""
        self._check_serializable(content)
        # ignorableWhitespace() writes its argument without escaping it.
        XMLGenerator.ignorableWhitespace(self, content)

    def startElement(self, name, attrs):
        """Open element ``name`` with its attributes written in sorted order."""
        # Sort attrs for a deterministic output.
        sorted_attrs = dict(sorted(attrs.items())) if attrs else attrs
        super().startElement(name, sorted_attrs)

上述内容与Django的处理程序相同,但添加了额外的“unescaped_characters”方法,并对“contents”进行条件检查,以查看它是否以“<![CDATA[”开头。正如您所看到的,unescaped_characters调用saxutils的XMLGenerator的“ignorableWhitespace”方法,它与其“characters”方法相同,只是不转义任何内容。
从这一点出发,您可以向Feed类添加一个新的“write()”方法,应按照Django代码中的注释所述进行操作,覆盖处理程序方法,提供您修改后的处理程序,如下所示,与原始处理程序类定义相同,但替换了处理程序类定义。
class iTunesPodcastsFeedGenerator(Rss201rev2Feed):
    """RSS 2.01 generator that serializes through MySimplerXMLGenerator."""

    def write(self, outfile, encoding):
        """Write the whole feed to ``outfile`` using the custom XML handler.

        The steps run in document order: start the document, open ``<rss>``
        and ``<channel>``, write the root elements and the items, then close
        the channel and ``<rss>``.
        """
        # The only purpose of this override is to choose the handler class.
        handler = MySimplerXMLGenerator(outfile, encoding)
        handler.startDocument()
        handler.startElement("rss", self.rss_attributes())
        handler.startElement("channel", self.root_attributes())
        self.add_root_elements(handler)
        self.write_items(handler)
        self.endChannelElement(handler)
        handler.endElement("rss")

现在您有了一个新的处理程序,它会选择性地不转义以“<![CDATA[”开头的字符串,因此您只需要手动给HTML字段加上CDATA前缀和后缀,并通过其他方式对这些字符串进行清理,例如,如果您安装了bleach并想使用它,则可以这样做...
 class iTunesPodcastsFeedGenerator(Rss201rev2Feed):
    """Example generator that writes the channel description as a CDATA section."""

    def add_root_elements(self, handler):
        # The description is cleaned with bleach (only p/ul/li/a tags are
        # listed as allowed) and then wrapped in CDATA markers, so a handler
        # that recognises the "<![CDATA[" prefix can write it unescaped.
        # NOTE(review): the parent add_root_elements() is not called here, so
        # as shown only <description> is written -- presumably abbreviated
        # for the example; confirm before copying.
        handler.addQuickElement("description", '<![CDATA[' + bleach.clean(self.feed['description'], strip=True, tags=['p', 'ul', 'li', 'a']) + ']]>')

截至目前,苹果(以及其他大部分播客目录)允许在描述中使用段落、无序列表和链接,因此上述是一个可以正常工作的播客源示例。

1
谢谢。我有一些自定义代码 https://github.com/philgyford/django-hines/blob/30fb84c1a83663866028295b8a6692ae7989e098/hines/core/feeds.py#L58-L77,它很愉快地向项中添加了 <content:encoded><![CDATA[...]]></content:encoded>,直到我更新了什么东西(到 Django 4.0?),编码中的某些内容发生了变化。你的解决方案让一切重新运作起来了。 - Phil Gyford

1
根据文档,handler是一个XMLGenerator,调用addQuickElement的假设是所有内容都是字符数据。这就是为什么它被转义的原因。
你可能需要覆盖SyndicationFeed.add_item_elements(self, handler, item),使用addQuickElement插入a元素,并使用startElement和endElement添加itunes:summary标签。
class iTunesFeed(Rss201rev2Feed):
    """RSS 2.01 generator that writes an itunes:summary containing a nested <a> element."""

    def add_item_elements(self, handler, item):
        """Write the standard item elements, then an itunes:summary holding a link."""
        # Delegate to the parent's item-level method (not add_root_elements).
        super(iTunesFeed, self).add_item_elements(handler, item)
        # startElement() requires the attribute mapping even when it is empty.
        handler.startElement('itunes:summary', {})
        handler.characters('Link to ')
        handler.addQuickElement('a', 'the website', {'href': 'http://www.website.com'})
        handler.endElement('itunes:summary')

这可能不是完全可用的,但应该可以让您接近目标。

朝着正确的方向前进!仅使用您的代码,我就完成了50%的工作:<a>元素的<标签不再被转义,但开头<![CDATA[中的<仍然被转义。结果为<itunes:summary>&lt;![CDATA[Link to <a href="http://www.website.com">the website</a></itunes:summary>。我该如何“覆盖SyndicationFeed.add_item_elements(self, handler, item)”以尝试更改此内容? - Mark
我尝试制作自己的自定义RssFeed(SyndicationFeed),重新编写add_root_elements方法,但是没有运气,即使尝试应用受SO启发的各种mark_safe()、unescape(...)和html_decode()函数...仍然无法阻止'<'符号被转义为&lt;。完全不知所措。 - Mark

1

以下是我如何在输出中使用CDATA标记而不被转义的方法。我创建了一个AppleGenerator,它继承自Rss201rev2Feed默认使用的SimplerXMLGenerator。然后我重写了Rss201rev2Feed使用的write函数,以使用我创建的新的AppleGenerator。对于AppleGenerator,我重写了characters和addQuickElement函数,以接受一个可选参数来禁用转义。

from django.utils.xmlutils import SimplerXMLGenerator
from xml.sax.saxutils import escape

class AppleGenerator(SimplerXMLGenerator):
    """SimplerXMLGenerator whose text-writing methods can skip XML escaping."""

    def addQuickElement(self, name, contents=None, attrs=None, escape_char=True):
        """Write a childless element, escaping ``contents`` unless ``escape_char`` is false."""
        self.startElement(name, {} if attrs is None else attrs)
        if contents is not None:
            self.characters(contents, escape_char=escape_char)
        self.endElement(name)

    def characters(self, content, escape_char=True):
        """Write ``content`` as text, escaped only when ``escape_char`` is true."""
        if not content:
            return
        self._finish_pending_start_element()
        # Byte strings are decoded with the generator's encoding first.
        text = content if isinstance(content, str) else str(content, self._encoding)
        self._write(escape(text) if escape_char else text)

class ApplePodcastsFeedGenerator(Rss201rev2Feed):
    """RSS 2.01 generator that serializes through AppleGenerator."""

    def write(self, outfile, encoding):
        """Write the whole feed to ``outfile`` using AppleGenerator as the XML handler."""
        # The only purpose of this override is to choose the handler class.
        handler = AppleGenerator(outfile, encoding)
        handler.startDocument()
        handler.startElement("rss", self.rss_attributes())
        handler.startElement("channel", self.root_attributes())
        self.add_root_elements(handler)
        self.write_items(handler)
        self.endChannelElement(handler)
        handler.endElement("rss")

这些覆盖实际上与函数之前的功能基本相同,但增加了一种不转义它们的方式。以下是saxutils的源代码:

https://github.com/python/cpython/blob/3.7/Lib/xml/sax/saxutils.py

这是Django SimplerXMLGenerator的源代码: https://github.com/django/django/blob/master/django/utils/xmlutils.py


0

我在Django 1.10中面临着同样的问题,并追溯到发生转义的位置。 django.utils.feedgenerator.RssFeed.write()使用django.utils.xmlutils.SimplerXMLGenerator作为处理程序来编写项目。该处理程序派生自xml.sax.saxutils.XMLGenerator,它的characters方法会转义所有内容。因此,要取消对放入提要中的所有内容的转义,请首先重写XML处理程序:

import re

from django.utils.xmlutils import SimplerXMLGenerator, UnserializableContentError


class UnescapedXMLGenerator(SimplerXMLGenerator):
    """SimplerXMLGenerator that writes ALL character data without escaping.

    Every string written through this handler reaches the output verbatim,
    so callers are responsible for passing content that is already valid
    XML (escaped text or complete CDATA sections).
    """

    def characters(self, content):
        """
        code is mainly copy-paste from Django 1.10 SimplerXMLGenerator.characters
        """
        if content and re.search(r'[\x00-\x08\x0B-\x0C\x0E-\x1F]', content):
            # Fail loudly when content has control chars (unsupported in XML 1.0)
            # See http://www.w3.org/International/questions/qa-controls
            raise UnserializableContentError("Control characters are not supported in XML 1.0")

        # ignorableWhitespace() is XMLGenerator's unescaped writer; it does
        # the text coercion itself, so no Python 2-only ``unicode`` is needed.
        self.ignorableWhitespace(content)

下一步是覆盖您的Feed的写入方法以使用新的处理程序。这里以Rss2.01 Feed为例:
from django.utils import feedgenerator
class Rss201rev2FeedUnescaped(feedgenerator.Rss201rev2Feed):
    """
    Rss 2.01 Feed that doesn't escape content

    All element text is written through UnescapedXMLGenerator.
    """
    def write(self, outfile, encoding):
        """
        Write the whole feed to ``outfile``.

        code is mainly copy-paste from django.utils.feedgenerator.Rss201rev2Feed
        except that the handler is set to UnescapedXMLGenerator
        """
        # The only purpose of this override is to choose the handler class.
        handler = UnescapedXMLGenerator(outfile, encoding)
        handler.startDocument()
        handler.startElement("rss", self.rss_attributes())
        handler.startElement("channel", self.root_attributes())
        self.add_root_elements(handler)
        self.write_items(handler)
        self.endChannelElement(handler)
        handler.endElement("rss")

1
handler.ignorableWhitespace()可以做到你所描述的事情,而不需要覆盖处理程序。这与handler.characters()相同,但在写入时没有转义。 - Nick Bisby

0
我所做的是使用下面的代码改进了 @RNC 的答案。
from django.utils.feedgenerator import Rss201rev2Feed
from django.utils.xmlutils import SimplerXMLGenerator
from django.contrib.syndication.views import Feed

#override django's simplerXMLGenerator class

class CustomXMLGenerator(SimplerXMLGenerator):
    """Override defaults django XML Generator to allow writing contents with CDATA prefix"""

    def addQuickElement(self, name, contents=None, attrs=None):
        "Convenience method for adding an element with no children"
        if attrs is None:
            attrs = {}
        self.startElement(name, attrs)
        if contents is not None:
            if contents.startswith("<![CDATA["):
                # ignorableWhitespace() writes the text as-is, without XML
                # escaping, so the CDATA markers survive in the output.
                self.ignorableWhitespace(contents)
            else:
                self.characters(contents)
        self.endElement(name)


class RSSFeedMixin(Rss201rev2Feed):
    """The wrapper class for the base RSSFeed class"""

    def write(self, outfile, encoding):
        """Write the whole feed to ``outfile`` using CustomXMLGenerator as the XML handler."""
        # point to the custom class
        handler = CustomXMLGenerator(outfile, encoding)
        handler.startDocument()
        handler.startElement("rss", self.rss_attributes())
        handler.startElement("channel", self.root_attributes())
        self.add_root_elements(handler)
        self.write_items(handler)
        self.endChannelElement(handler)
        handler.endElement("rss")

class GlobalFeed(Feed):
    """Feed whose text fields are wrapped in CDATA before being handed to the generator."""

    def wrap_with_cdata(self, text):
        """Utility method to wrap a text in CDATA block"""
        # A literal "]]>" inside the text would close the section early, so
        # split it across two adjacent CDATA sections.
        safe_text = text.replace("]]>", "]]]]><![CDATA[>")
        return "<![CDATA[ " + safe_text + " ]]>"

    # ... (remaining feed methods omitted in the original answer)

    def item_author_name(self, item: Article) -> str:
        """
        Takes an item, as returned by items(), and returns the item's
        author's name as a normal Python string.
        """
        # wrap with the utility method
        return self.wrap_with_cdata(item.author.fullname)

希望能有所帮助。

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接