仅供学习使用。
运行环境:需要 Java 环境。
依赖:需要 Jsoup。
主代码:
- package com.cn.st.site.ink.jav;
-
- import com.cn.st.util.DownloadImage;
- import org.jsoup.Jsoup;
- import org.jsoup.nodes.Document;
- import org.jsoup.nodes.Element;
- import org.jsoup.select.Elements;
-
- import java.io.IOException;
- import java.util.ArrayList;
- import java.util.List;
-
- /**
- * @Author: XXX-HR
- * @Date: 2020/8/21 17:58
- * @Desc: 目标地址:https://www.jav.ink/
- * @tips: 这个网站是个小黄站~~!
- */
- public class Spider {
-
- /**
- * 总页数
- */
- final static Integer TOTAL = 1258;
-
- final static String URL = "https://www.jav.ink/page/";
-
- final static String savePath = "D:/image/";
-
-
- public static void main(String[] args) throws IOException {
- for (int i = 1; i < TOTAL; i++) {
- System.out.println("准备下载第:" + i+"页");
- getPage(URL+"/");
- System.out.println("第:" + i+"页抓取完毕、");
- }
-
- }
-
- private static void getPage(String pageAddress) throws IOException {
- //解析单页
- Document document = Jsoup.connect(URL)
- .timeout(2001 * 1000)
- .ignoreContentType(true)
- .ignoreHttpErrors(true)
- .cookie("cookie","__cfduid=d2c7c3594bb4f5b3cd5517f9c6749c3e51598003707; HstCfa4208213=1598003714265; HstCmu4208213=1598003714265; HstCnv4208213=2; HstCns4208213=2; HstCla4208213=1598008749837; HstPn4208213=9; HstPt4208213=10")
- // .cookie()
- .header("user-agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36")
- .get();
- System.out.println("抓取成功...");
- Elements uls = document.select("#infinite-articles");
- List<Image> imageList = new ArrayList<>();
-
- for (Element li : uls.select("li")) {
- Elements a_img = li.select("a img");
- String img_url = a_img.attr("src");
- String otherImg = a_img.attr("srcset");
- String altName = a_img.attr("alt");
- Image image = Image.getImage(altName, img_url, otherImg.trim());
- imageList.add(image);
- down10Images(imageList);
- }
- System.out.println("下载了一页....");
- }
-
- public static void down10Images(List<Image> imageList) {
- if (imageList.size() > 10) {
- System.out.println("开始下载文件。。");
- downImages(imageList);
- } else {
- return;
- }
- System.out.println("下载了10个...");
- }
-
- private static void downImages(List<Image> imageList) {
- for (Image i : imageList) {
- try {
- DownloadImage.download(i.getUrl(), i.getName()+".jpg", savePath + i.getName() + "/");
- if (i.getOthers().size() > 0) {
- for (String otherUrls : i.getOthers()) {
- DownloadImage.download(otherUrls, i.getName()+".jpg", savePath + i.getName() + "/");
- }
- }
- } catch (Exception e) {
- continue;
- }
- }
- imageList.clear();
- }
-
- }
复制代码
- package com.cn.st.util;
-
- import java.io.File;
- import java.io.FileOutputStream;
- import java.io.InputStream;
- import java.io.OutputStream;
- import java.net.HttpURLConnection;
- import java.net.URL;
- import java.util.concurrent.TimeUnit;
-
-
/**
 * Utility class for downloading images.
 */
public class DownloadImage {

    /** Maximum time to wait for the connection to be established (50 s). */
    private static final int CONNECT_TIMEOUT_MILLIS = 50 * 1000;

    /** Maximum time to wait for data once connected (50 s). */
    private static final int READ_TIMEOUT_MILLIS = 50 * 1000;

    /** Browser-like user agent; some servers answer 403 to unknown clients. */
    private static final String USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36";

    /**
     * Downloads a remote file over HTTP(S) into a local directory.
     *
     * <p>The file is stored as {@code <currentTimeMillis>---<filename>} inside
     * {@code savePath}, which is created if missing. Each call waits two
     * seconds before contacting the server to throttle the request rate.
     *
     * @param urlString full address of the remote file (http or https)
     * @param filename  name of the file on disk; path separators and other
     *                  characters that are illegal in file names are replaced
     *                  with {@code _}
     * @param savePath  local directory to store the file in
     * @throws java.net.MalformedURLException if {@code urlString} is not a valid URL
     * @throws IllegalArgumentException if the URL is not http/https
     * @throws InterruptedException if interrupted during the throttle delay
     * @throws Exception on any I/O failure while connecting, reading or writing
     */
    public static void download(String urlString, String filename, String savePath) throws Exception {
        // Validate the address first so bad input fails immediately, before
        // the delay and before anything is created on disk.
        URL url = new URL(urlString);
        String protocol = url.getProtocol();
        if (!"http".equalsIgnoreCase(protocol) && !"https".equalsIgnoreCase(protocol)) {
            throw new IllegalArgumentException("Unsupported protocol: " + protocol);
        }

        // Throttle requests.
        TimeUnit.SECONDS.sleep(2);

        HttpURLConnection con = (HttpURLConnection) url.openConnection();
        try {
            con.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
            con.setReadTimeout(READ_TIMEOUT_MILLIS);
            con.setRequestMethod("GET");
            con.setRequestProperty("User-Agent", USER_AGENT);

            // Open the response before touching the disk so a failed request
            // leaves no empty directory or file behind.
            try (InputStream in = con.getInputStream()) {
                File dir = new File(savePath);
                if (!dir.isDirectory() && !dir.mkdirs()) {
                    throw new java.io.IOException("Cannot create directory: " + dir);
                }
                File target = new File(dir,
                        System.currentTimeMillis() + "---" + sanitizeFileName(filename));

                try (OutputStream out = new FileOutputStream(target)) {
                    byte[] buffer = new byte[8192];
                    int len;
                    while ((len = in.read(buffer)) != -1) {
                        out.write(buffer, 0, len);
                    }
                }
            }
        } finally {
            con.disconnect();
        }
    }

    /**
     * Replaces path separators, characters Windows rejects in file names and
     * control characters with {@code _} so the result is a single path segment.
     */
    private static String sanitizeFileName(String filename) {
        return String.valueOf(filename).replaceAll("[\\\\/:*?\"<>|\\p{Cntrl}]", "_");
    }
}
复制代码
|