使用 Python 去除 HTML 標籤的幾種方法

待去除標籤的 HTML 示例如下:

In [96]: test
Out[96]: '<p>just for test</p><br/><font>just for test</font><b>test</b>'

方法1:逐字元走訪字串,用旗標記錄目前是否位於標籤內(第二段示例改為把每個標籤替換成一個空格):

In [97]: str_ = ''
    ...: flag = 1
    ...: for ele in test:
    ...:     if ele == "<":
    ...:         flag = 0
    ...:     elif ele == '>':
    ...:         flag = 1
    ...:         continue
    ...:     if flag == 1:
    ...:         str_ += ele
    ...:

In [98]: str_
Out[98]: 'just for testjust for testtest'

In [99]: str_ = ''
    ...: flag = 1
    ...: for ele in test:
    ...:     if ele == "<":
    ...:         flag = 0
    ...:     elif ele == '>':
    ...:         flag = 1
    ...:         ele = ' '
    ...:     if flag == 1:
    ...:         str_ += ele
    ...:

In [100]: str_
Out[100]: ' just for test   just for test  test '

方法2:用正規表示式的環視(lookbehind / lookahead)擷取「>」與「<」之間的文字:

import re
In [156]: pat = re.compile('(?<=>).*?(?=<)')

In [157]: pat.findall(test)
Out[157]: ['just for test', '', '', 'just for test', '', 'test']

In [158]: ''.join(pat.findall(test))
Out[158]: 'just for testjust for testtest'

方法3:用擷取群組取出「>」與「<」之間的文字:

pat = re.compile('>(.*?)<')
''.join(pat.findall(test))

方法4:用正規表示式比對標籤本身,並將其替換為空字串:

In [167]: pat = re.compile('<[^>]+>', re.S)

In [168]: pat.sub('', test)
Out[168]: 'just for testjust for testtest'