# 用 Python 快速过滤指定 HTML 标签的函数
"""
@author: MR.N
@created: 2022/3/30 Wed.
@version: 1.0
"""
import io
import re
# Tags whose opening/closing markers are dropped while their inner text is kept.
_SIMPLE_TAGS = (
    'div', 'ul', 'li', 'ol', 'p', 'span', 'form', 'br',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'hr', 'input',
    'title', 'table', 'tbody', 'a',
    'i', 'strong', 'b', 'big', 'small', 'u', 's', 'strike',
    'img', 'center', 'dl', 'dt', 'font', 'em',
    'code', 'pre', 'link', 'meta', 'iframe', 'ins',
)
# Tags that are removed together with everything between open and close.
_BLOCK_TAGS = ('script', 'style')
# Table tags: removed line by line; affected lines also lose their newline.
_TABLE_TAGS = ('tr', 'th', 'td')

# One pattern per block tag: "<script ...> anything </script>", non-greedy.
_BLOCK_RES = tuple(
    re.compile(rf'<\s*{tag}\b[^>]*>[\s\S]*?<\s*/\s*{tag}\s*>', re.I)
    for tag in _BLOCK_TAGS
)
# The lookahead (?=[\s/>]) forces the tag name to end right there, so "b"
# matches <b> and <b class="x"> but not <body>, and "a" does not match <abbr>.
_SIMPLE_TAG_RE = re.compile(
    r'</?(?:' + '|'.join(_SIMPLE_TAGS) + r')(?=[\s/>])[^<>]*>', re.I
)
_TABLE_TAG_RE = re.compile(
    r'</?(?:' + '|'.join(_TABLE_TAGS) + r')(?=[\s/>])[^<>]*>', re.I
)
_MULTI_SPACE_RE = re.compile(r' {2,}')
_MULTI_NEWLINE_RE = re.compile(r'\n{2,}')


def filter_html_tags(text):
    """Strip a fixed set of HTML tags from *text* using regular expressions.

    Processing steps, in order:

    1. ``<script>`` / ``<style>`` elements are removed together with their
       content. This runs first so that their bodies never leak into the
       output.
    2. Opening and closing markers of the tags in ``_SIMPLE_TAGS`` are
       removed; the text between them is kept. Tag names are matched as
       whole names and case-insensitively.
    3. The text is processed line by line (lines end at ``'\\n'``). On a line
       containing a complete ``tr`` / ``th`` / ``td`` tag, those tags are
       removed and the line's newline is dropped, so consecutive table
       lines are joined. Runs of two or more spaces on any line are
       collapsed to a single space.
    4. Runs of consecutive newlines are collapsed to a single newline.

    This is a lightweight regex filter, not an HTML parser: tags not listed
    above are left untouched, and a table tag that is split across several
    lines is not removed.

    :param text: HTML source as a ``str``.
    :return: the filtered text.
    """
    # Remove whole script/style elements before touching any other tag.
    for block_re in _BLOCK_RES:
        text = block_re.sub('', text)

    text = _SIMPLE_TAG_RE.sub('', text)

    pieces = []
    # Iterating a StringIO splits on '\n' only and keeps the line ending.
    for line in io.StringIO(text):
        without_tags, hits = _TABLE_TAG_RE.subn('', line)
        if hits:
            # Join table rows/cells onto one line by dropping the newline.
            line = without_tags.replace('\n', '')
        # Collapse runs of spaces but keep single spaces between words.
        pieces.append(_MULTI_SPACE_RE.sub(' ', line))

    return _MULTI_NEWLINE_RE.sub('\n', ''.join(pieces))
发表评论 取消回复