input : this is test <b> bold text </b> normal text
expected output: this is test normal text
即删除指定标签的内容。
input : this is test <b> bold text </b> normal text
expected output: this is test normal text
BeautifulSoup 的解决方案:
from BeautifulSoup import BeautifulSoup
def removeTag(soup, tagname):
    """Remove every ``tagname`` element from ``soup`` in place.

    Each matching element is detached from the tree with ``extract()``,
    so the element's content disappears together with the tag itself.

    Args:
        soup: A parsed document (or sub-tree) exposing ``findAll``.
        tagname: Name of the tag to remove, e.g. ``"b"``.

    Returns:
        None; ``soup`` is modified in place.
    """
    for tag in soup.findAll(tagname):
        tag.extract()
# Demo: strip the <b> element first, then the <d> element, printing the
# document after each step.
s = BeautifulSoup("abcd <b> btag </b> hello <d>dtag</d>")
removeTag(s, "b")
# The single-argument call form prints the same text on Python 2 and 3.
print(s)
removeTag(s, "d")
print(s)
返回:
>>>
abcd hello <d>dtag</d>
abcd hello
使用BeautifulSoup:
from BeautifulSoup import BeautifulSoup
# Concatenate every text node of the parsed document into one string, leaving
# the markup out. NOTE(review): `page` is not defined in this snippet; it is
# presumably the HTML source to parse.
''.join(BeautifulSoup(page).findAll(text=True))
这段代码摘自 http://www.ghastlyfop.com/blog/2008/12/strip-html-tags-from-string-python.html ,该文介绍了如何使用Python去除字符串中的HTML标签。
BeautifulSoup(page).get_text()。 - Honza Javorek
如果你不介意使用Python(尽管正则表达式是通用的),你可以从Django的strip_tags过滤器中获取一些灵感,详情请查看这里。
为了完整起见,这里重现一下-
def strip_tags(value):
    """Return the given HTML with all tags stripped.

    A "tag" here is anything matching ``<``, a run of characters other
    than ``>``, then ``>``; only the markup is removed, the text between
    tags is kept.
    """
    text = force_unicode(value)
    return re.sub(r'<[^>]*?>', '', text)
注:如果您正在使用此方法或其他正则表达式解决方案,请记住,它可以允许一些精心制作的HTML(请参见评论),以及HTML注释,因此不应与不受信任的输入一起使用。相反,考虑使用beautifulsoup、html5lib或lxml等答案来处理不可信的输入。
strip_tags 过滤器存在问题,几乎无用。在属性值中放置 > 字符是完全有效的,而且它甚至不尝试处理其他标记构造,如注释。 - bobince
尝试使用以下步骤:
import re
# Demo: delete every tag (but not the text between tags) with one regex.
# NOTE: `input` shadows the builtin of the same name; the name is kept so
# anything else referring to this snippet's variables keeps working.
input = 'this is test <b> bold text </b> normal text'
output = re.compile(r'<[^<]*?/?>').sub('', input)
# The single-argument call form prints the same text on Python 2 and 3.
print(output)
看起来你需要 HTMLParser。(在 Python 3 中是 html.parser。)
from HTMLParser import HTMLParser
from sys import stdout
class Filter(HTMLParser):
    """Echo HTML to a stream, dropping selected elements and their content.

    Every start tag, end tag, text run, character/entity reference,
    comment, declaration and processing instruction reported by the parser
    is written back out, except for elements whose tag name is listed in
    ``ignored_tags``: those are suppressed together with everything nested
    inside them.

    Nesting is tracked with a plain depth counter (``ignorelevel``), so
    badly unbalanced markup inside an ignored element can end the
    suppression early or late.
    """

    # HTML void elements never get an end tag, so they must not change the
    # nesting depth; otherwise suppression would never be switched off again.
    _VOID_TAGS = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ])

    def __init__(self, ignored_tags, out=None):
        """Create a filter.

        Args:
            ignored_tags: Container of tag names to drop. The parser reports
                tag names in lower case, so entries should be lower case.
            out: Object with a ``write`` method that receives the filtered
                markup. Defaults to ``sys.stdout``.
        """
        # HTMLParser is an old-style class on Python 2, where
        # super(Filter, self) raises TypeError, so the base initializer is
        # called directly. Where the parser supports it, reference decoding
        # is turned off so that references reach handle_charref /
        # handle_entityref and are echoed verbatim instead of being decoded
        # into handle_data.
        try:
            HTMLParser.__init__(self, convert_charrefs=False)
        except TypeError:
            HTMLParser.__init__(self)
        self.ignorelevel = 0
        self.ignored_tags = ignored_tags
        self._out = stdout if out is None else out

    def _emit(self, text):
        """Write ``text`` unless the parser is inside an ignored element."""
        if self.ignorelevel == 0:
            self._out.write(text)

    def handle_starttag(self, tag, attrs):
        """Echo a start tag, or begin/deepen suppression for ignored content."""
        if tag in self._VOID_TAGS:
            # No end tag will follow, so the depth counter is left alone.
            if tag not in self.ignored_tags:
                self._emit(self.get_starttag_text())
        elif self.ignorelevel > 0:
            self.ignorelevel += 1
        elif tag in self.ignored_tags:
            self.ignorelevel = 1
        else:
            # get_starttag_text() is the tag exactly as it appeared in the
            # input, angle brackets included.
            self._out.write(self.get_starttag_text())

    def handle_startendtag(self, tag, attrs):
        """Echo a self-closing tag unless it is ignored or inside ignored content."""
        if tag not in self.ignored_tags:
            self._emit(self.get_starttag_text())

    def handle_endtag(self, tag):
        """Echo an end tag, or unwind one level of suppression."""
        if self.ignorelevel > 0:
            # Everything up to and including the end tag that closes the
            # ignored element is dropped.
            if tag not in self._VOID_TAGS:
                self.ignorelevel -= 1
            return
        # A stray end tag of an ignored element is dropped as well.
        if tag not in self.ignored_tags:
            self._out.write('</' + tag + '>')

    def handle_data(self, data):
        """Echo a text run."""
        self._emit(data)

    def handle_charref(self, name):
        """Echo a numeric character reference unchanged."""
        self._emit('&#' + name + ';')

    def handle_entityref(self, name):
        """Echo a named entity reference unchanged."""
        self._emit('&' + name + ';')

    def handle_comment(self, data):
        """Echo a comment; ``data`` already carries its own inner whitespace."""
        self._emit('<!--' + data + '-->')

    def handle_decl(self, data):
        """Echo a declaration such as a doctype."""
        self._emit('<!' + data + '>')

    def handle_pi(self, data):
        """Echo a processing instruction."""
        self._emit('<?' + data + '>')
据我所知,Sam的答案应该可以很好地完成所需的工作,但最好确保任何剩余的 <> 字符都被替换为 &lt; 和 &gt;,以防止误用/无效的HTML。
这种方法的优点是它可以接受极其畸形的HTML引用/标签。BeautifulSoup也可以很好地处理畸形标签,但如果我没记错的话,html5lib、sgmllib和htmllib可能会在无效代码上出现问题,其中有些问题更多。
以下代码还验证了& HTML引用:
import re
from htmlentitydefs import name2codepoint, codepoint2name
S = '1234567890ABCDEF'
# Lookup table holding every hexadecimal digit in both lower and upper case.
# Only the keys matter; the values are always None.
DHex = {}
for i in S:
    DHex.update({i.lower(): None, i.upper(): None})
def IsHex(S):
    """Return True when ``S`` is non-empty and made only of hex digits.

    Digits are looked up in the module-level ``DHex`` table; an empty or
    otherwise falsy ``S`` yields False.
    """
    if not S:
        return False
    return all(ch in DHex for ch in S)
def _decode_reference(name):
    """Return the character for a reference body, or None if it is not valid.

    ``name`` is the text between ``&`` and ``;``: an entity name such as
    ``amp``, a decimal reference such as ``#52`` or a hexadecimal one such
    as ``#x34``.
    """
    # Entity names are case sensitive ('Aacute' and 'aacute' are different
    # characters), so the exact spelling is tried first. The lower-cased
    # lookup is kept only as a lenient fallback for input such as '&AMP;'.
    if name in name2codepoint:
        return unichr(name2codepoint[name])
    if name.lower() in name2codepoint:
        return unichr(name2codepoint[name.lower()])

    if name[:1] != '#':
        return None
    digits = name[1:]
    if digits.isdigit():
        base = 10
    elif digits[:1].lower() == 'x' and IsHex(digits[1:]):
        digits = digits[1:]
        base = 16
    else:
        return None

    try:
        return unichr(int(digits, base))
    except (ValueError, OverflowError):
        # A numeric reference outside the range unichr() accepts is not a
        # valid reference; the caller keeps it as literal text.
        return None


def UnEscape(S, LReEscape=None):
    """Convert HTML character references in ``S`` into the characters they name.

    Text that follows an ``&`` but is not a recognised reference is kept
    unchanged, including the ``&`` itself.

    Args:
        S: Text that may contain named, decimal or hexadecimal references.
        LReEscape: Optional list. When given, a ``(position, character)``
            tuple is appended for every reference that was decoded, where
            ``position`` is the index of ``character`` in the returned
            string. ``ReEscape`` uses this list to turn the result back
            into HTML.

    Returns:
        The text with all valid references decoded.
    """
    track = LReEscape is not None
    LRtn = []
    offset = 0  # length of the output assembled so far
    for index, piece in enumerate(S.split('&')):
        if index == 0:
            # Everything before the first '&' is plain text.
            text = piece
        else:
            name, _, rest = piece.partition(';')
            char = _decode_reference(name)
            if char is None:
                # Not a reference: restore the '&' that split() removed.
                text = '&' + piece
            else:
                if track:
                    LReEscape.append((offset, char))
                text = char + rest
        LRtn.append(text)
        offset += len(text)
    return ''.join(LRtn)
def ReEscape(LReEscape, S, EscFn):
    """Turn text decoded by ``UnEscape`` back into HTML.

    Args:
        LReEscape: ``(position, character)`` tuples for the characters in
            ``S`` that came from decoded references.
        S: The decoded text.
        EscFn: Callable applied to every stretch of ``S`` lying between
            (and after) the recorded characters.

    Returns:
        The re-assembled string. Each recorded character is written as a
        named reference when ``codepoint2name`` knows it, otherwise as a
        decimal numeric reference.
    """
    pieces = []
    cursor = 0
    for start, char in LReEscape:
        if start != cursor:
            pieces.append(EscFn(S[cursor:start]))
        code = ord(char)
        if code in codepoint2name:
            reference = '&%s;' % codepoint2name[code]
        else:
            reference = '&#%s;' % code
        pieces.append(reference)
        cursor = start + len(char)
    # Whatever follows the last recorded character is ordinary text.
    pieces.append(EscFn(S[cursor:]))
    return ''.join(pieces)
def escape(value):
    """Escape the HTML-special characters ``&``, ``>`` and ``<`` in ``value``.

    Returns:
        ``value`` with ``&`` replaced by ``&amp;``, ``>`` by ``&gt;`` and
        ``<`` by ``&lt;``.
    """
    # '&' has to go first: otherwise the ampersands introduced by the other
    # two replacements would themselves be escaped.
    value = value.replace('&', '&amp;')
    value = value.replace('>', '&gt;')
    value = value.replace('<', '&lt;')
    return value
def strip_tags(value):
    """Strip HTML tags from ``value`` and re-validate its character references.

    The text is processed in two steps, and the intermediate and final
    results are printed with the prefixes ``No Tags:`` and
    ``References Validated:``:

    1. Everything matching ``<``, non-``>`` characters, ``>`` is removed.
    2. References are decoded with ``UnEscape`` and written back with
       ``ReEscape``, with ``escape`` applied to all remaining text.

    Returns:
        The processed string.
    """
    # Strip HTML tags
    value = re.sub(r'<[^>]*?>', '', value)
    # Formatting into a single argument keeps the output identical whether
    # print is the Python 2 statement or the Python 3 function.
    print('No Tags: %s' % (value,))

    # Validate & references
    LReEscape = []
    value = UnEscape(value, LReEscape)
    value = ReEscape(LReEscape, value, EscFn=escape)
    print('References Validated: %s' % (value,))
    return value
if __name__ == '__main__':
    # Demo run. strip_tags() prints two lines for the sample below: one
    # prefixed "No Tags:" with the tags removed, and one prefixed
    # "References Validated:" with the text after it has been through
    # UnEscape/ReEscape.
    sample = 'this is test <b> bold text </b> normal text >< &blah & &'
    strip_tags(sample)
如果您想包含一些安全标签,我建议使用http://code.google.com/p/html5lib/。
请参阅http://code.google.com/p/html5lib/wiki/UserDocumentation中的“Sanitizing Tokenizer”部分。
如果这是一个重要的服务,请记得测试漏洞:http://ha.ckers.org/xss.html。
这是从我的项目Supybot中提取的可用代码,因此经过了相当充分的测试:
class HtmlToText(sgmllib.SGMLParser):
    """Extracted from some eff-bot code on c.l.p.

    Collects the text of an HTML document, putting ``tagReplace`` in the
    place of every start and end tag the parser reports as unknown.
    """

    # Private copy of the standard entity table with 'nbsp' overridden.
    entitydefs = dict(htmlentitydefs.entitydefs, nbsp=' ')

    def __init__(self, tagReplace=' '):
        """Set up an empty buffer; ``tagReplace`` is the stand-in for tags."""
        self.data = []
        self.tagReplace = tagReplace
        sgmllib.SGMLParser.__init__(self)

    def unknown_starttag(self, tag, attr):
        """Record the replacement string in place of a start tag."""
        self.data.append(self.tagReplace)

    def unknown_endtag(self, tag):
        """Record the replacement string in place of an end tag."""
        self.data.append(self.tagReplace)

    def handle_data(self, data):
        """Record a run of text."""
        self.data.append(data)

    def getText(self):
        """Return the collected text, stripped and passed to normalizeWhitespace."""
        return normalizeWhitespace(''.join(self.data).strip())
def htmlToText(s, tagReplace=' '):
    """Convert HTML to text.

    ``tagReplace`` is the string used to replace HTML tags.
    """
    parser = HtmlToText(tagReplace)
    parser.feed(s)
    return parser.getText()
正如文档字符串所述,它起源于Fredrik Lundh而不是我。就像他们说的那样,伟大的作者会窃取 :)
from webob.exc import strip_tags
然后使用它:
# The single-argument call form prints the same text on Python 2 and 3.
print(strip_tags('a<br/>b'))
>> ab