import requests
from bs4 import BeautifulSoup
import re
import os
# Default HTTP headers used by getHtml: a referer pointing at the site root
# and a desktop Edge/Chrome user-agent string.
# NOTE(review): presumably needed so the site serves pages/images to the
# script as it would to a browser -- confirm against the site's behaviour.
headers = {
    "referer": "https://www.mzitu.com/",
    "user-agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/95.0.4638.54 Safari/537.36 Edg/95.0.1020.40"
    ),
}
def getHtml(url, headers=headers, timeout=10):
    """Fetch *url* with HTTP GET and return the response, or ``None`` on failure.

    Args:
        url: Address to request.
        headers: Header mapping sent with the request; defaults to the
            module-level ``headers``.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole crawl.

    Returns:
        The ``requests.Response`` with its text encoding forced to UTF-8,
        or ``None`` when the request raised or returned an error status
        (in which case "获取失败" is printed).
    """
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are reported and swallowed here;
        # KeyboardInterrupt, SystemExit and programming errors propagate.
        print("获取失败")
        return None
    r.encoding = "utf-8"
    return r
def parserHtml(url):
    """Collect album links and their page counts from an index page.

    The index page is expected to contain a ``<ul id="pins">`` whose
    ``<li>`` items each hold an ``<a href=...>`` to one album. Every album
    page is fetched and its page count is read from the pagination markup.

    Args:
        url: Address of the index page.

    Returns:
        A list of ``(href, num)`` tuples, where ``href`` is the album URL
        and ``num`` is the page count parsed from it. Albums that cannot be
        fetched or whose page count cannot be found are skipped; the list
        is empty when the index page itself is unavailable or has no
        ``ul#pins`` element.
    """
    # Compiled once here instead of on every loop iteration.
    page_count = re.compile(
        r"<span class='dots'>…</span><a href='.*?'><span>(?P<num>\d+)</span>")
    herflist = []

    r = getHtml(url)
    if r is None:
        # getHtml already reported the failure.
        return herflist
    try:
        soup = BeautifulSoup(r.text, "html.parser")
    finally:
        r.close()

    pins = soup.find("ul", id="pins")
    if pins is None:
        return herflist

    for item in pins("li"):
        link = item.find("a")
        if link is None or "href" not in link.attrs:
            continue
        herf = link.attrs["href"]

        respon = getHtml(herf)
        if respon is None:
            continue
        try:
            match = page_count.search(respon.text)
        finally:
            respon.close()

        if match is None:
            # Pagination markup missing: the page count is unknown, so the
            # album is skipped rather than crashing the whole crawl.
            print("未找到页数：" + herf)
            continue
        herflist.append((herf, int(match.group("num"))))
    return herflist
def downLoad(herf, num, save_dir="图片"):
    """Download the images of one album into a folder named after its title.

    Page ``i`` of the album is fetched from ``herf + "/" + str(i)`` for
    ``i`` in ``1..num``; on each page the first ``<img class="blur">`` is
    saved under its original file name.

    Args:
        herf: Base URL of the album.
        num: Number of pages to walk.
        save_dir: Directory in which the per-album folder is created. The
            default matches the relative ``图片`` folder the files were
            always written to, so the folder that is created and the folder
            that is written to can no longer disagree.
    """
    title = ""
    target = ""
    for i in range(1, num + 1):
        r = getHtml(herf + "/" + str(i))
        if r is None:
            if i == 1:
                # The album title comes from page 1; without it there is
                # no folder to save into.
                return
            continue
        try:
            soup = BeautifulSoup(r.text, "html.parser")
        finally:
            r.close()

        if i == 1:
            title = soup.find("title").string.split(sep="-")[0].strip()
            target = os.path.join(save_dir, title)
            # makedirs + exist_ok: works on re-runs and when save_dir
            # itself does not exist yet.
            os.makedirs(target, exist_ok=True)

        tag = soup.find("img", "blur")
        if tag is None:
            continue
        img = tag.attrs["src"]
        name = img.split(sep="/")[-1]

        picture = getHtml(img)
        if picture is None:
            continue
        try:
            with open(os.path.join(target, name), "wb") as f:
                f.write(picture.content)
        finally:
            picture.close()
        print(title+name+"下载完毕")
def main():
    """Read the album list from the site index and download each album."""
    start_url = "https://www.mzitu.com"
    for album_url, page_total in parserHtml(start_url):
        downLoad(album_url, page_total)
# Script entry point: start the crawl as soon as the file is executed.
# NOTE(review): this also runs on import; left unconditional so existing
# behaviour is unchanged.
main()