A Simple Crawler for Zhihu User Avatars



   Motivation: the Zhihu column post 带逛传万世 因有我参与 - 知乎专栏

   First of all, thanks to grapeot for providing the experimental data.

Overview

  • This post is based on the autoface page: a Java crawler fetches the Zhihu user avatars listed there and stores them locally.

  • Jsoup is used to parse the web pages.

  • Crawling is done in a single thread.

  • Avatars are saved to the zhihupics folder under the project directory.

Approach

This is a very simple test case. Start by parsing the source of the autoface page:

(Screenshot: autoface_source, the source listing of the autoface page)

Each line corresponds to the URL of one user's Zhihu profile page.

(Screenshot: sample, one profile URL per line)
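For this first step, a minimal sketch of the link extraction with Jsoup could look like the following. It assumes org.jsoup:jsoup is on the classpath and that the profile URLs appear as ordinary a[href] links, as in the screenshot above; the class name ListProfileLinks is made up for illustration.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class ListProfileLinks {
    public static void main(String[] args) throws Exception {
        // Fetch the autoface page and print every profile link it contains.
        Document doc = Jsoup.connect("http://lab.grapeot.me/zhihu/autoface").get();
        for (Element link : doc.select("a[href]")) {
            // "abs:href" resolves relative URLs against the page URL.
            System.out.println(link.attr("abs:href"));
        }
    }
}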

Next, use Jsoup to walk through all of the link nodes, visit each user's Zhihu profile page, and inspect its source:

(Screenshot: zhihu_user_profile, source of a user's profile page)

This reveals where the avatar image is hosted. Then iterate over every user's avatar URL, download the image to local disk through an I/O stream, and number the avatars sequentially.

(Screenshot: images_file, the downloaded image files)
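The download step can also be sketched on its own. The snippet below is only an illustration: it uses java.nio.file.Files.copy for brevity, whereas the full program further down uses a manual buffer loop, and the avatar URL shown here is a placeholder rather than a real one.

import java.io.InputStream;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;

public class DownloadOneAvatar {
    public static void main(String[] args) throws Exception {
        // Placeholder avatar URL; in the real crawl it comes from the profile page.
        String avatarUrl = "https://example.com/avatar.jpg";
        Path target = Paths.get("zhihupics", "1.jpg");
        // Make sure the zhihupics folder exists before writing into it.
        Files.createDirectories(target.getParent());
        try (InputStream in = new URL(avatarUrl).openStream()) {
            // Stream the image straight to disk, overwriting any existing file.
            Files.copy(in, target, StandardCopyOption.REPLACE_EXISTING);
        }
    }
}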

The locally saved avatars look like this:

(Screenshots: avatar previews 1–7)

Code

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * @author RunDouble
 */
public class PictureInZhihu {

    /**
     * Reads the raw HTML of a page into a String.
     * Kept as a helper; the crawl itself uses Jsoup directly.
     */
    public String getWebString(String webUrl) {
        StringBuilder webStringBuffer = new StringBuilder();
        if (webUrl == null) {
            return "";
        }
        try {
            URL url = new URL(webUrl);
            HttpURLConnection connection = (HttpURLConnection) url.openConnection();
            try (BufferedReader bufferedReader =
                    new BufferedReader(new InputStreamReader(connection.getInputStream()))) {
                String tempString;
                while ((tempString = bufferedReader.readLine()) != null) {
                    webStringBuffer.append(tempString);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return webStringBuffer.toString();
    }

    /**
     * Parses the autoface page, follows every profile link on it, and returns
     * a map from profile URL to avatar image URL.
     */
    public static HashMap<String, String> getUsersUrlAndprofileUrl(String webUrl) {
        HashMap<String, String> urlWithProfile = new HashMap<>();
        try {
            Document document = Jsoup.connect(webUrl).get();
            Elements links = document.select("a[href]");
            for (int i = 0; i < links.size(); i++) {
                // "abs:href" resolves relative URLs against the page URL.
                String addressLinkHref = links.get(i).attr("abs:href");
                Document profilePage;
                try {
                    profilePage = Jsoup.connect(addressLinkHref).get();
                } catch (Exception e) {
                    // Skip links that cannot be fetched.
                    continue;
                }
                // The first .jpg image on the profile page is taken as the avatar.
                Elements jpgs = profilePage.select("img[src$=.jpg]");
                if (jpgs.isEmpty()) {
                    continue;
                }
                String profileLinkSrc = jpgs.get(0).attr("src");
                System.out.println(addressLinkHref);
                System.out.println(profileLinkSrc + "\n\n");
                if (!profileLinkSrc.equals("") && profileLinkSrc.contains("http")) {
                    urlWithProfile.put(addressLinkHref, profileLinkSrc);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return urlWithProfile;
    }

    /**
     * Downloads every avatar in the map into the zhihupics folder,
     * numbering the files sequentially (1.jpg, 2.jpg, ...).
     */
    public static void picturePersistence(HashMap<String, String> info) {
        File dir = new File("zhihupics");
        if (!dir.exists()) {
            dir.mkdir();
        }
        int index = 1;
        for (Map.Entry<String, String> entry : info.entrySet()) {
            String tempPicUrl = entry.getValue();
            try {
                URL url = new URL(tempPicUrl);
                HttpURLConnection httpURLConnection = (HttpURLConnection) url.openConnection();
                httpURLConnection.setReadTimeout(50000);
                String newImageName = index + ".jpg";
                try (InputStream inputStream = httpURLConnection.getInputStream();
                        FileOutputStream fos = new FileOutputStream(new File(dir, newImageName))) {
                    byte[] buffer = new byte[1024];
                    int length;
                    while ((length = inputStream.read(buffer)) != -1) {
                        fos.write(buffer, 0, length);
                    }
                }
                index++;
            } catch (Exception e) {
                // Skip avatars that fail to download; the index is not advanced.
            }
        }
    }

    /*
     * just a test
     */
    public static void main(String[] args) {
        long start = System.currentTimeMillis();
        PictureInZhihu.picturePersistence(
                PictureInZhihu.getUsersUrlAndprofileUrl("http://lab.grapeot.me/zhihu/autoface"));
        long end = System.currentTimeMillis();
        double totalTime = ((end - start) / 1000.0) / 60;
        System.out.println("Total time: " + totalTime + " minutes.");
    }
}

Summary
   This is the most basic kind of crawler: no multithreading, no simulated login, and certainly no IP proxies. It is all about the fun.
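   If speed ever mattered more than fun, the download step could be handed to a fixed thread pool, roughly as sketched below. This is only an outline: downloadOne is a hypothetical helper that would wrap the stream-copy logic from picturePersistence above, and the pool size and timeout are arbitrary.

import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class ParallelDownload {
    public static void downloadAll(Map<String, String> urlWithProfile) throws InterruptedException {
        // Submit each avatar download to a small thread pool instead of looping sequentially.
        ExecutorService pool = Executors.newFixedThreadPool(8);
        int index = 1;
        for (String avatarUrl : urlWithProfile.values()) {
            final String fileName = index++ + ".jpg";
            pool.submit(() -> downloadOne(avatarUrl, fileName));
        }
        pool.shutdown();
        // Wait for all downloads to finish (generous upper bound).
        pool.awaitTermination(10, TimeUnit.MINUTES);
    }

    // Hypothetical helper: copy the image at avatarUrl to zhihupics/fileName,
    // using the same stream-copy logic as picturePersistence above.
    private static void downloadOne(String avatarUrl, String fileName) {
        // ...
    }
}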