def remove_js_css(content):
    """Strip non-content markup from an HTML string.

    The following constructs are removed, in this order, each matched
    case-insensitively and across line breaks:

    * ``<script ...> ... </script>`` elements
    * ``<style ...> ... </style>`` elements
    * ``<!-- ... -->`` comments
    * ``<meta ...>`` tags
    * ``<ins ...> ... </ins>`` elements

    Args:
        content: HTML text to clean.

    Returns:
        The text with every match of the patterns above deleted.
    """
    # Non-greedy ``.*?`` together with re.S lets one match span several lines
    # while still stopping at the first closing delimiter.
    flags = re.I | re.M | re.S
    patterns = (
        r'<script.*?</script>',
        r'<style.*?</style>',
        r'<!--.*?-->',
        r'<meta.*?>',
        r'<ins.*?</ins>',
    )
    s = content
    for pattern in patterns:
        s = re.compile(pattern, flags).sub('', s)
    return s
def remove_empty_line(content):
    """Remove whitespace-only lines and collapse consecutive newlines.

    Args:
        content: Text to compact.

    Returns:
        The text with whitespace-only lines emptied and every run of
        newline characters reduced to a single newline. Leading or trailing
        newlines are collapsed but not stripped.
    """
    # Pass 1: blank out lines that consist solely of whitespace. With re.M
    # the anchors match at every line boundary.
    s = re.compile(r'^\s+$', re.M | re.S).sub('', content)
    # Pass 2: squeeze the newline runs left behind into one newline each.
    return re.compile(r'\n+', re.M | re.S).sub('\n', s)
def remove_any_tag(s):
    """Delete every ``<...>`` tag from *s* and strip surrounding whitespace.

    Args:
        s: HTML text.

    Returns:
        The text with all tags removed and leading/trailing whitespace
        stripped.
    """
    return re.sub(r'<[^>]+>', '', s).strip()


def remove_any_tag_but_a(s):
    """Measure how much of an HTML fragment's text sits inside links.

    Args:
        s: HTML text.

    Returns:
        A ``(link_length, text_length)`` tuple. ``link_length`` is the total
        length of the content of all ``<a ...>...</a>`` elements (any markup
        nested inside an anchor is counted as-is). ``text_length`` is the
        length of *s* after ``remove_any_tag``.
    """
    # The word boundary after "a" keeps the pattern from matching other tags
    # that merely start with "a" (abbr, article, aside, ...) and also lets a
    # bare <a> without attributes match correctly.
    link_texts = re.findall(r'<a\b[^>]*>(.*?)</a>', s, re.I | re.S)
    plain_text = remove_any_tag(s)
    return len(''.join(link_texts)), len(plain_text)


def remove_image(s, n=50):
    """Replace every ``<img ...>`` tag in *s* with a run of *n* ``'a'`` chars.

    Args:
        s: HTML text.
        n: Length of the placeholder substituted for each image tag.

    Returns:
        The text with each image tag replaced by the placeholder.
    """
    # NOTE(review): presumably the placeholder makes an image count as n
    # characters in later text-length measurements -- confirm with callers.
    placeholder = 'a' * n
    return re.compile(r'<img.*?>', re.I | re.M | re.S).sub(placeholder, s)


def remove_video(s, n=1000):
    """Replace every ``<embed ...>`` tag in *s* with a run of *n* ``'a'`` chars.

    Args:
        s: HTML text.
        n: Length of the placeholder substituted for each embed tag.

    Returns:
        The text with each embed tag replaced by the placeholder.
    """
    # NOTE(review): presumably the placeholder makes an embedded video count
    # as n characters in later text-length measurements -- confirm with callers.
    placeholder = 'a' * n
    return re.compile(r'<embed.*?>', re.I | re.M | re.S).sub(placeholder, s)
# Original article: http://www.cnblogs.com/lizunicon/p/3516561.html