Python学习之爬虫案例

Python学习之爬虫案例

– 直接上代码

Python 引入模块

1
2
import re
from urllib.request import urlopen

获取网页内容并返回

1
2
3
def getPage(url):
    """Fetch ``url`` and return the response body decoded as UTF-8 text.

    Args:
        url: Address to open; passed unchanged to ``urlopen``.

    Returns:
        The full response body as a ``str``.

    Raises:
        urllib.error.URLError: If the resource cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the underlying connection even when
    # reading or decoding fails.
    with urlopen(url) as response:
        return response.read().decode('utf-8')

使用正则表达式提取需要的内容

1
2
3
4
5
6
def parseHtml(s):
    """Extract list entries from an HTML string with a regular expression.

    Each match is an ``<li>`` element containing a ``<span class="view">``
    with a number, followed by a ``<div class="a1">`` wrapping a link.

    Args:
        s: HTML text to search.

    Returns:
        A list of ``(vcount, href, title)`` string tuples, one per match,
        in document order. Empty when nothing matches.
    """
    # Raw strings keep ``\s``, ``\W`` and ``\d`` as regex escapes instead of
    # (invalid) Python string escapes; the pattern text itself is unchanged.
    pattern = (
        r'<li>.*?<span class="view">\s\W+(?P<vcount>\d+).*?<em>.*?</em>.*?</span>'
        r'.*?<div class="a1">.*?<a href="(?P<href>.*?)">(?P<title>.*?)</a>.*?</div>.*?</li>'
    )
    # re.S lets ``.`` cross newlines, since one entry spans several lines.
    return re.findall(pattern, s, re.S)

内容保存

1
2
3
4
5
6
def writefile(dbs, path=r'new.txt'):
    """Print each record and save it as one line of a UTF-8 text file.

    The first three items of every record are concatenated without a
    separator, echoed to stdout, and written followed by a newline.
    Any existing file at ``path`` is overwritten.

    Args:
        dbs: Iterable of indexable records holding at least three strings.
        path: Destination file; defaults to ``new.txt`` in the current
            working directory.
    """
    # ``with`` guarantees the file is closed even if a record is malformed.
    with open(path, "w", encoding="utf-8") as f:
        for row in dbs:
            line = row[0] + row[1] + row[2]
            print(line)
            f.write(line + '\n')

启动调用方法

1
2
3
4
5
6
def main():
    """Fetch the target page, extract its entries, and save them to a file."""
    url = "http://www.yichang100.com/"
    response_html = getPage(url)
    entries = parseHtml(response_html)
    writefile(entries)


# Run the crawl only when executed as a script, so importing this module
# does not trigger a network request.
if __name__ == "__main__":
    main()