|
禁止求评分、诱导评分、互刷评分、互刷悬赏值,违规者封号处理。
禁止发布推广、邀请码、邀请链接、二维码或者有利益相关的任何推广行为。
所有非原创软件请发布在【精品软件区】,发帖必须按照本版块版规格式发帖。
初学Python3,看到一个美图网站,顺手写了一个脚本,自动爬取网站套图信息并存入MySQL数据库中。
准备工作:
1、MySQL建表,共计3个:存套图集信息、存套图集下的套图信息、存套图下的图片信息
第一个表:ttb_atlas
| 字段 | 类型 | 说明 |
| --- | --- | --- |
| ID | int | 主键,自增 |
| AtlasName | varchar | 套图集名称 |
| AtlasUrl | varchar | 套图集链接地址 |
第二个表:ttb_album
| 字段 | 类型 | 说明 |
| --- | --- | --- |
| ID | int | 主键,自增 |
| AtlasName | varchar | 套图集名称 |
| AlbumName | varchar | 套图名称 |
| AlbumUrl | varchar | 套图链接地址 |
| AlbumPicUrl | varchar | 套图预览照片链接 |
| AlbumPicDate | date | 套图日期 |
第三个表:ttb_photo
| 字段 | 类型 | 说明 |
| --- | --- | --- |
| ID | int | 主键,自增 |
| AlbumName | varchar | 套图名称 |
| PhotoName | varchar | 图片名称 |
| PhotoUrl | varchar | 图片链接地址 |
2、创建主函数程序ttb_lk.py
- #!/usr/local/Cellar/python/3.7.1/bin
- # -*- coding: UTF-8 -*-
- import sys
- sys.path.append("/Python")
- import conf.ttb_manage as myconf
- import conf.mysql_db as mysqldb
-
home_url = 'https://www.192td.com'


def main():
    """Crawl the site in three passes and store the results in MySQL.

    1. Insert the atlas (album-collection) index found on the home page.
    2. For every stored atlas, collect its albums: name, link, cover, date.
    3. For every stored album, collect its photos: name and link.
    """
    # Step 1: insert the home-page atlas index into the database.
    myconf.get_home_atlas(home_url)

    # Step 2: collect the albums of every atlas.
    for row in _fetch_rows('select * from ttb_atlas'):
        myconf.get_atlas_album(row['AtlasName'], row['AtlasUrl'])
        print(row)

    # Step 3: collect the photos of every album.
    for row in _fetch_rows('select * from ttb_album'):
        myconf.get_album_photo(row['AlbumName'], row['AlbumUrl'])


def _fetch_rows(sql):
    """Run *sql* on a short-lived connection and return the rows as a list.

    ``Database.fetch_all`` reports failure with a falsy value instead of
    raising; that is normalised to an empty list here so callers can always
    iterate the result. The connection is closed even if the query raises.
    """
    db = mysqldb.Database()
    try:
        return db.fetch_all(sql) or []
    finally:
        db.close()


if __name__ == '__main__':
    main()
3、创建程序ttb_manage.py
- #!/usr/local/Cellar/python/3.7.1/bin
- # -*- coding: UTF-8 -*-
- import sys,requests,re,time,threading
- from bs4 import BeautifulSoup
- sys.path.append("/Python")
- import conf.mysql_db as mysqldb
-
- #======================================================
def get_home_atlas(home_url):
    """Scrape the atlas (album-collection) index from the home page.

    Every child of an element with class ``childnav`` that contains a link
    is inserted into ``ttb_atlas`` as (AtlasName, AtlasUrl).

    Args:
        home_url: URL of the site's home page.

    Returns:
        Always ``True``. A failure on one entry is printed and the scan
        continues with the next entry.
    """
    html = get_html(home_url)
    soup = BeautifulSoup(html, "lxml")
    db = mysqldb.Database()
    try:
        for nav in soup.find_all(class_='childnav'):
            for item in nav:
                # Direct children also include bare text nodes (whitespace
                # between tags); they expose no ``.a`` and are skipped rather
                # than aborting the whole scan.
                link = getattr(item, 'a', None)
                if link is None or not link.get('href'):
                    continue
                try:
                    # NOTE(review): values are interpolated into the SQL text
                    # unescaped, so a quote in scraped text breaks the
                    # statement. Switch to bound parameters once the database
                    # helper accepts them.
                    sql = "insert into ttb_atlas(AtlasName,AtlasUrl) values('%s','%s')" % (item.string, link['href'])
                    db.execute(sql)
                except Exception as exc:
                    # Print the actual error, not the Exception class.
                    print(exc)
    finally:
        db.close()
    return True
-
def get_atlas_album(AtlasName, AtlasUrl):
    """Walk every listing page of one atlas and store the albums found.

    The page count is read from the "尾页" (last page) link; when that link
    is absent or does not carry a page number the atlas is treated as a
    single page. Pages are visited from the last one down to the first, with
    a one-second pause after each to stay polite to the server.

    Args:
        AtlasName: Name of the atlas, stored alongside each album.
        AtlasUrl: URL of the atlas' first listing page; also the prefix for
            the ``index_<n>.html`` pages.

    Returns:
        Always ``True``.
    """
    html = get_html(AtlasUrl)
    soup = BeautifulSoup(html, "lxml")

    # Determine the number of listing pages (defaults to one).
    pages = 1
    last_link = soup.find('a', string='尾页')
    if last_link is not None:
        found = re.search(r'_(\d+)\.html', last_link.get('href') or '')
        if found:
            pages = int(found.group(1))

    for page in range(pages, 0, -1):
        # Page 1 lives at the atlas URL itself; later pages are index_<n>.html.
        if page == 1:
            Page_Url = AtlasUrl
        else:
            Page_Url = AtlasUrl + 'index_' + str(page) + '.html'
        print(AtlasName + ' URL:' + Page_Url)
        get_atlas_album_html(AtlasName, AtlasUrl, Page_Url)
        time.sleep(1)
    return True
-
def get_atlas_album_html(AtlasName, AtlasUrl, Page_Url):
    """Store the albums listed on one atlas listing page in ``ttb_album``.

    For each entry under the first element with class ``clearfix`` the album
    name, link, cover image and date are extracted and inserted.

    Args:
        AtlasName: Name of the atlas the page belongs to.
        AtlasUrl: Base URL of the atlas; album file names are appended to it.
        Page_Url: URL of the listing page to scrape.

    Returns:
        Always ``True``. A failure on one entry is printed and the scan
        continues with the next entry.
    """
    html = get_html(Page_Url)
    soup = BeautifulSoup(html, "lxml")
    listing = soup.find(class_='clearfix')
    if listing is None:
        # Nothing to iterate on this page (layout change or error page).
        return True

    db = mysqldb.Database()
    try:
        for item in listing:
            # Skip bare text nodes and entries without a link.
            if getattr(item, 'a', None) is None:
                continue
            try:
                AlbumUrl = AtlasUrl + re.findall(r'/(\w+\.html)', item.a['href'], re.S)[0]
                AlbumPicDate = item.b.string
                AlbumName = item.span.string
                AlbumPicUrl = item.img['lazysrc']
                # NOTE(review): values are interpolated into the SQL text
                # unescaped; switch to bound parameters once the database
                # helper accepts them.
                sql = "insert into ttb_album(AtlasName,AlbumName,AlbumUrl,AlbumPicUrl,AlbumPicDate) values('%s','%s','%s','%s','%s')" % (AtlasName, AlbumName, AlbumUrl, AlbumPicUrl, AlbumPicDate)
                # Only report success when the insert really succeeded.
                if db.execute(sql):
                    print(AlbumName + ' URL:' + AlbumUrl + ' 插入成功!')
                else:
                    print(AlbumName + ' URL:' + AlbumUrl + ' 插入失败!')
            except Exception as exc:
                # Print the actual error, not the Exception class.
                print(exc)
    finally:
        db.close()
    return True
-
def get_album_photo(AlbumName, AlbumUrl):
    """Store every photo of one album in ``ttb_photo``.

    The album's first page supplies the first photo and the total photo
    count (``<span id="allnum">``). Each further photo lives on its own
    page ``<album>_<n>.html`` and is fetched by a worker thread running
    ``get_img_insert``; the function waits for all workers before returning.

    Args:
        AlbumName: Name of the album, stored alongside each photo.
        AlbumUrl: URL of the album's first page (expected to end in
            ``.html``).

    Returns:
        ``True`` when the first page was parsed, ``False`` when it lacks the
        expected image or photo counter.
    """
    # Fetch the real album page (the original read a local debug file
    # through an undefined helper).
    html = get_html(AlbumUrl)
    soup = BeautifulSoup(html, "lxml")

    first_img = soup.img
    counter = soup.find('span', id='allnum')
    if first_img is None or counter is None:
        print(AlbumName + ' URL:' + AlbumUrl + ' 解析失败!')
        return False

    # First page: photo info plus the total number of photos in the album.
    PhotoName = first_img['alt']
    PhotoUrl = first_img['lazysrc']
    photo_count = int(counter.get_text())

    db = mysqldb.Database()
    try:
        # NOTE(review): values are interpolated into the SQL text unescaped;
        # switch to bound parameters once the database helper accepts them.
        sql = "insert into ttb_photo(AlbumName,PhotoName,PhotoUrl) values('%s','%s','%s')" % (AlbumName, PhotoName, PhotoUrl)
        db.execute(sql)
        print("第1张:" + PhotoName + ' URL:' + PhotoUrl)
    finally:
        db.close()

    # Remaining pages: one worker thread per photo, started half a second
    # apart to avoid hammering the server.
    workers = []
    for i in range(2, photo_count + 1):
        url = AlbumUrl[:-5] + "_" + format(i) + ".html"
        th = threading.Thread(target=get_img_insert, args=(i, AlbumName, url))
        workers.append(th)
        th.start()
        time.sleep(0.5)
    # Wait for the workers so the album is complete when we return and the
    # number of concurrent threads/connections stays bounded per album.
    for th in workers:
        th.join()
    return True
-
def get_img_insert(i, AlbumName, url):
    """Fetch one photo page and insert its photo into ``ttb_photo``.

    Intended to run as a worker-thread target, so problems are reported by
    printing instead of raising.

    Args:
        i: 1-based position of the photo in the album (used for logging).
        AlbumName: Name of the album the photo belongs to.
        url: URL of the photo page.

    Returns:
        None.
    """
    html = get_html(url)
    soup = BeautifulSoup(html, "lxml")
    img = soup.img
    if img is None or not img.get('lazysrc'):
        # Page without the expected image: report it and give up on this one.
        print("第" + format(i) + "张:" + ' URL:' + url + ' 解析失败!')
        return
    PhotoName = img.get('alt', '')
    PhotoUrl = img['lazysrc']

    db = mysqldb.Database()
    try:
        print("第" + format(i) + "张:" + PhotoName + ' URL:' + PhotoUrl)
        # NOTE(review): values are interpolated into the SQL text unescaped;
        # switch to bound parameters once the database helper accepts them.
        sql = "insert into ttb_photo(AlbumName,PhotoName,PhotoUrl) values('%s','%s','%s')" % (AlbumName, PhotoName, PhotoUrl)
        db.execute(sql)
    finally:
        db.close()
    return
-
-
- #======================================================
def get_html(url, timeout=10):
    """Download *url* and return the response body decoded as UTF-8 text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawler forever.

    Returns:
        The page source as a string.

    Raises:
        requests.RequestException: On connection errors or timeouts.
    """
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        # The header name must not contain spaces. "br" is not advertised
        # because brotli responses can only be decoded when an optional
        # brotli package is installed.
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36',
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    # Force UTF-8 instead of relying on the charset guessed from the headers.
    resp.encoding = 'utf-8'
    return resp.text
4、创建程序mysql_db.py(注意修改mysql连接的参数)
- #!/usr/local/Cellar/python/3.7.1/bin
- # -*- coding: UTF-8 -*-
- import mysql.connector
- import logging
-
# Logging setup: a dedicated named logger for the database layer.
logger = logging.getLogger("dbSql")
# Output format: timestamp, level name padded to 8 columns, then the message.
formatter = logging.Formatter('%(asctime)s%(levelname)-8s:%(message)s')
# A formatter only takes effect once it is attached to a handler. The guard
# avoids stacking duplicate handlers if this module is imported more than once.
if not logger.handlers:
    _handler = logging.StreamHandler()
    _handler.setFormatter(formatter)
    logger.addHandler(_handler)
-
class Database:
    """Small convenience wrapper around one ``mysql.connector`` connection.

    All methods are best-effort: errors are logged and reported through the
    return value instead of being raised. Statements may carry ``%s``
    placeholders with the values passed separately as *params*, which lets
    the driver do the quoting.
    """

    def __init__(self):
        """Open the connection using the settings below."""
        self._dbhost = 'localhost'  # database host
        self._dbuser = 'root'  # database user
        self._dbpassword = 'root'  # database password
        self._dbname = 'lk'  # database (schema) name
        self._dbcharset = 'utf8'  # connection character set
        # Defined up front so close() is safe even when connecting fails.
        self._cursor = None
        self._conn = self.connectMysql()
        if self._conn:
            self._cursor = self._conn.cursor()

    def connectMysql(self):
        """Return a new connection, or ``False`` when connecting fails."""
        try:
            return mysql.connector.connect(
                host=self._dbhost,
                user=self._dbuser,
                passwd=self._dbpassword,
                database=self._dbname,
                charset=self._dbcharset,
            )
        except Exception:
            logger.error("connect database failed!", exc_info=True)
            return False

    def execute(self, sql, params=None):
        """Execute a data-modifying statement and commit it.

        Args:
            sql: Statement text, optionally containing ``%s`` placeholders.
            params: Optional sequence (or mapping) of values for the
                placeholders.

        Returns:
            ``True`` on success; ``False`` when not connected or on error.
        """
        if not self._conn:
            return False
        try:
            self._cursor.execute(sql, params)
            self._conn.commit()
        except Exception:
            logger.warning("update database exception SQL=" + sql, exc_info=True)
            return False
        return True

    def _query(self, sql, params, dictionary, single):
        """Run a SELECT on a short-lived cursor and return one row or all rows.

        A dedicated cursor per query keeps the dictionary/tuple row format
        from leaking into later calls. It is buffered so that it can be
        closed cleanly after reading only the first row.

        Returns:
            The fetched row(s); ``''`` when not connected; ``False`` on error.
        """
        if not self._conn:
            return ''
        cursor = None
        try:
            cursor = self._conn.cursor(dictionary=dictionary, buffered=True)
            cursor.execute(sql, params)
            return cursor.fetchone() if single else cursor.fetchall()
        except Exception:
            logger.warning("query database exception SQL=" + sql, exc_info=True)
            return False
        finally:
            if cursor is not None:
                try:
                    cursor.close()
                except Exception:
                    logger.warning("close cursor exception SQL=" + sql, exc_info=True)

    def fetch_all(self, sql, params=None):
        """Return all rows as dictionaries keyed by column name."""
        return self._query(sql, params, dictionary=True, single=False)

    def fetchall(self, sql, params=None):
        """Return all rows as plain tuples."""
        return self._query(sql, params, dictionary=False, single=False)

    def fetch_one(self, sql, params=None):
        """Return the first row as a dictionary keyed by column name."""
        return self._query(sql, params, dictionary=True, single=True)

    def fetchone(self, sql, params=None):
        """Return the first row as a plain tuple."""
        return self._query(sql, params, dictionary=False, single=True)

    def close(self):
        """Close the cursor and the connection if they are open.

        Safe to call more than once.

        Returns:
            Always ``True``.
        """
        if self._conn:
            # The original compared type(...) with the string 'object',
            # which is never true, so nothing was ever closed.
            for resource in (getattr(self, '_cursor', None), self._conn):
                if resource is None:
                    continue
                try:
                    resource.close()
                except Exception:
                    logger.warning(
                        "close database exception,%s,%s",
                        type(self._cursor), type(self._conn),
                    )
            self._cursor = None
            self._conn = False
        return True
复制代码
|
|