Maven依赖:
-
<!-- 阿里巴巴 JSON -->
<dependency>
    <groupId>com.alibaba</groupId>
    <artifactId>fastjson</artifactId>
    <version>1.2.47</version>
</dependency>
<!-- HttpClient HTTP请求 -->
<dependency>
    <groupId>org.apache.httpcomponents</groupId>
    <artifactId>httpclient</artifactId>
    <version>4.5.2</version>
</dependency>
<!-- Jsoup 解析HTML文本 -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>
爬取工具类:
-
package com.zyq.tools;
-
-
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
-
-
/**
-
* 文章爬取工具类
-
*
-
* @author ZhangYuanqiang
-
* @since 2020/01/04
-
*/
-
public
class SpiderUtil {
-
-
// 微信公众号文章域名
-
private
static
final String WX_DOMAIN =
"https://mp.weixin.qq.com";
-
// 文章返回前端统一key常量
-
private
static
final String KEY_TITLE =
"title";
// 文章标题
-
private
static
final String KEY_COVER_URL =
"coverLink";
// 文章封面图链接
-
private
static
final String KEY_REFER_NAME =
"referName";
// 文章出处作者
-
private
static
final String KEY_REFER_URL =
"referLink";
// 文章出处链接
-
private
static
final String KEY_TAGS =
"tags";
// 文章内容
-
private
static
final String KEY_NAME =
"name";
// 标签名称
-
private
static
final String KEY_TEXT =
"text";
// 文本信息
-
private
static
final String KEY_HREF =
"href";
// a标签链接
-
-
/**
-
* 测试主方法
-
*/
-
public static void main(String args[]) {
-
String url =
"https://mp.weixin.qq.com/s/OEjKIxTRFSY5lcNk6YIlUg";
-
Resp<JSONObject> resp = getActicle(url);
-
if (resp.isSuccess()) {
-
System.out.println(resp.getBody());
-
}
else {
-
System.out.println(resp.getMsg());
-
}
-
}
-
-
/**
-
* 根据文章链接抓取文章内容
-
*
-
* @param url 文章链接
-
* @return 文章内容
-
*/
-
public static Resp<JSONObject> getActicle(String url) {
-
// 检测链接是否合法
-
String msg = checkUrl(url);
-
if (msg !=
null) {
-
return Resp.error(msg);
-
}
-
// 请求与响应
-
String resp = HttpTool.get(url, getWxHeaderMap());
-
if (resp ==
null || resp.trim().length() ==
0) {
-
return Resp.error(
"文章获取失败,请检查链接是否正确");
-
}
-
// 解析
-
Resp<JSONObject> acticleResp = getWxActicleContent(resp, url);
-
if (acticleResp.isError()) {
-
return Resp.error(acticleResp.getMsg());
-
}
-
return acticleResp;
-
}
-
-
/**
-
* 检测文章链接是否合法
-
*/
-
public static String checkUrl(String url) {
-
if (url ==
null) {
-
return
"请输入文章链接";
-
}
-
if (!url.startsWith(WX_DOMAIN)) {
-
return
"请输入微信公众号文章链接";
-
}
-
return
null;
-
}
-
-
-
/**
-
* 微信公众号请求头设置
-
*/
-
public static Map<String, String> getWxHeaderMap() {
-
Map<String, String> map =
new HashMap<>(
new LinkedHashMap<>());
-
map.put(
"Accept",
"text/html, application/xhtml+xml, image/jxr, */*");
-
map.put(
"Accept-Encoding",
"gzip, deflate");
-
map.put(
"Accept-Language",
"zh-Hans-CN, zh-Hans; q=0.8, en-US; q=0.5, en; q=0.3");
-
map.put(
"Host",
"mp.weixin.qq.com");
-
map.put(
"If-Modified-Since",
"Sat, 04 Jan 2020 12:23:43 GMT");
-
map.put(
"User-Agent",
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko");
-
return map;
-
}
-
-
-
/**
-
* 解析微信公众号文章
-
*
-
* @param resp 请求文章响应
-
* @param url 文章链接
-
* @return 文章信息
-
*/
-
public static Resp<JSONObject> getWxActicleContent(String resp, String url) {
-
try {
-
Document document = Jsoup.parse(resp);
-
// 文章出处(作者)
-
String referName = document.getElementsByClass(
"profile_nickname").get(
0).text();
-
// 文章封面图链接
-
String coverUrl = document.select(
"meta[property=\"og:image\"]").get(
0).attr(
"content");
-
// 文章标题
-
String title = document.getElementById(
"activity-name").text();
-
// 文章内容
-
Element content = document.getElementsByClass(
"rich_media_area_primary_inner").get(
0);
-
JSONObject json =
new JSONObject(
new LinkedHashMap<>());
-
json.put(KEY_TITLE, title);
-
json.put(KEY_COVER_URL, coverUrl);
-
json.put(KEY_REFER_NAME, referName);
-
json.put(KEY_REFER_URL, url);
-
JSONArray tags =
new JSONArray();
-
Elements sections = content.select(
"*");
-
for (Element element : sections) {
-
if (element.children().isEmpty()) {
-
getChildTag(element, tags);
-
}
-
}
-
json.put(KEY_TAGS, tags);
-
return Resp.success(json);
-
}
catch (Exception e) {
-
e.printStackTrace();
-
return Resp.error(
"文章解析失败");
-
}
-
}
-
-
public static void getChildTag(Element element, JSONArray tags) {
-
JSONObject tag =
new JSONObject(
new LinkedHashMap<>());
-
String tagName = element.tagName();
-
tag.put(KEY_NAME, tagName);
-
switch (tagName) {
-
case
"span": {
-
tag.put(KEY_TEXT, element.text());
-
tags.add(tag);
-
break;
-
}
-
case
"img": {
-
Attributes attrs = element.attributes();
-
if (attrs !=
null) {
-
for (Attribute attr : attrs) {
-
tag.put(attr.getKey().replace(
"-",
""), attr.getValue());
-
}
-
}
-
tags.add(tag);
-
break;
-
}
-
case
"a": {
-
tag.put(KEY_HREF, element.attr(
"href"));
-
tag.put(KEY_TEXT, element.attr(
"textvalue"));
-
tags.add(tag);
-
break;
-
}
-
case
"br": {
-
tags.add(tag);
-
break;
-
}
-
case
"p": {
-
tag.put(KEY_TEXT, element.text());
-
tags.add(tag);
-
break;
-
}
-
default:
-
break;
-
}
-
}
-
-
}
Httpclient工具类:
-
package com.zyq.tools;
-
-
import java.io.BufferedReader;
-
import java.io.File;
-
import java.io.FileOutputStream;
-
import java.io.IOException;
-
import java.io.InputStream;
-
import java.io.InputStreamReader;
-
import java.io.UnsupportedEncodingException;
-
import java.util.ArrayList;
-
import java.util.List;
-
import java.util.Map;
-
import java.util.Map.Entry;
-
-
import org.apache.http.HttpEntity;
-
import org.apache.http.HttpResponse;
-
import org.apache.http.client.entity.UrlEncodedFormEntity;
-
import org.apache.http.client.methods.HttpGet;
-
import org.apache.http.client.methods.HttpPost;
-
import org.apache.http.client.methods.HttpUriRequest;
-
import org.apache.http.entity.ContentType;
-
import org.apache.http.entity.mime.MultipartEntityBuilder;
-
import org.apache.http.entity.mime.content.FileBody;
-
import org.apache.http.entity.mime.content.StringBody;
-
import org.apache.http.impl.client.HttpClients;
-
import org.apache.http.message.BasicNameValuePair;
-
-
/**
-
* Httpclient工具类
-
*
-
* @author sunnyzyq
-
* @since 2019/04/22
-
*/
-
public
class HttpTool {
-
-
private
static
final
int BYTE_LEN =
102400;
// 100KB
-
private
static
final String CHARSET =
"UTF-8";
// 编码格式
-
-
/**
-
* get请求
-
* @param url 请求地址(get请求时参数自己组装到url上)
-
* @return 响应文本
-
*/
-
public static String get(String url) {
-
// 请求地址,以及参数设置
-
HttpGet get =
new HttpGet(url);
-
// 执行请求,获取相应
-
return getRespString(get);
-
}
-
-
/**
-
* get请求
-
* @param url 请求地址(get请求时参数自己组装到url上)
-
* @param headerMap 请求头
-
* @return 响应文本
-
*/
-
public static String get(String url, Map<String, String> headerMap) {
-
// 请求地址,以及参数设置
-
HttpGet get =
new HttpGet(url);
-
if (headerMap !=
null) {
-
for (Entry<String, String> entry : headerMap.entrySet()) {
-
get.setHeader(entry.getKey(), entry.getValue());
-
}
-
}
-
// 执行请求,获取相应
-
return getRespString(get);
-
}
-
-
/**
-
* post 请求
-
* @param url 请求地址
-
* @param params 请求参数
-
* @return 响应文本
-
*/
-
public static String post(String url, Map<String, String> params){
-
// 构建post请求
-
HttpPost post =
new HttpPost(url);
-
// 构建请求参数
-
List<BasicNameValuePair> pairs =
new ArrayList<BasicNameValuePair>();
-
if (params !=
null) {
-
for (Entry<String, String> entry : params.entrySet()) {
-
pairs.add(
new BasicNameValuePair(entry.getKey(), entry.getValue()));
-
}
-
}
-
HttpEntity entity =
null;
-
try {
-
entity =
new UrlEncodedFormEntity(pairs, CHARSET);
-
}
catch (UnsupportedEncodingException e) {
-
e.printStackTrace();
-
}
-
post.setEntity(entity);
-
// 执行情趣,获取相应
-
return getRespString(post);
-
}
-
-
/**
-
* 文件上传
-
* @param url 请求地址
-
* @param params 请求参数 (文件类型须为File)
-
* @return 响应文本
-
*/
-
public static String postFile(String url, Map<String, Object> params) {
-
HttpPost post =
new HttpPost(url);
-
MultipartEntityBuilder builder = MultipartEntityBuilder.create();
-
if (params !=
null) {
-
for (String key : params.keySet()) {
-
Object value = params.get(key);
-
if (value ==
null) {
-
builder.addPart(key,
new StringBody(
"",ContentType.TEXT_PLAIN));
-
continue;
-
}
-
if (value
instanceof File) {
-
builder.addPart(key,
new FileBody((File) value));
-
}
else {
-
builder.addPart(key,
new StringBody(value.toString(), ContentType.TEXT_PLAIN));
-
}
-
}
-
}
-
HttpEntity entity = builder.build();
-
post.setEntity(entity);
-
return getRespString(post);
-
}
-
-
/**
-
* 文件下载
-
*/
-
public static void getFile(String url, String name) {
-
// 图片地址
-
HttpGet get =
new HttpGet(url);
-
// 执行请求,获取响应流
-
InputStream in = getRespInputStream(get);
-
// InputStream 转 File,保存在当前工程中
-
File file =
new File(name);
-
try {
-
FileOutputStream fos =
new FileOutputStream(file);
-
byte b[] =
new
byte[BYTE_LEN];
-
int j =
0;
-
while( (j = in.read(b)) != -
1){
-
fos.write(b,
0, j);
-
}
-
fos.close();
-
}
catch (Exception e) {
-
e.printStackTrace();
-
}
-
}
-
-
/**
-
* 获取响应信息(String)
-
*/
-
public static String getRespString(HttpUriRequest request) {
-
// 获取响应流
-
InputStream in = getRespInputStream(request);
-
-
StringBuilder sb =
new StringBuilder();
-
String line;
-
-
BufferedReader br =
new BufferedReader(
new InputStreamReader(in));
-
try {
-
while ((line = br.readLine()) !=
null) {
-
sb.append(line);
-
}
-
}
catch (IOException e) {
-
e.printStackTrace();
-
}
-
String str = sb.toString();
-
return str;
-
}
-
-
/**
-
* 获取响应信息(InputStream)
-
*/
-
public static InputStream getRespInputStream(HttpUriRequest request) {
-
// 获取响应对象
-
HttpResponse response =
null;
-
try {
-
response = HttpClients.createDefault().execute(request);
-
}
catch (Exception e) {
-
e.printStackTrace();
-
}
-
if (response ==
null) {
-
return
null;
-
}
-
// 获取Entity对象
-
HttpEntity entity = response.getEntity();
-
// 获取响应信息流
-
InputStream in =
null;
-
if (entity !=
null) {
-
try {
-
in = entity.getContent();
-
}
catch (Exception e) {
-
e.printStackTrace();
-
}
-
}
-
return in;
-
}
-
}
响应工具类:
-
package com.zyq.tools;
-
-
/**
 * Generic response wrapper: a status code, an optional message and an
 * optional body.
 *
 * @param <T> type of the body
 * @author sunnyzyq
 * @since 2019/04/23
 */
public class Resp<T> {

    /** Status code of a successful response. */
    public static final int SUCCESS = 0;

    /** Status code of a failed response. */
    public static final int ERROR = 1;

    // A response is successful unless a different code is set.
    int code = SUCCESS;
    String msg;
    T body;

    /** Creates a successful response without message and body. */
    public Resp() {}

    /**
     * Creates a successful response with the given body.
     *
     * @param t body of the response
     */
    public Resp(T t) {
        this.body = t;
    }

    /**
     * Creates a response with all fields given.
     *
     * @param code status code, {@link #SUCCESS} or {@link #ERROR}
     * @param msg  message, may be {@code null}
     * @param body body, may be {@code null}
     */
    public Resp(int code, String msg, T body) {
        this.code = code;
        this.msg = msg;
        this.body = body;
    }

    /** @return an error response without message and body */
    public static <T> Resp<T> error() {
        return new Resp<>(ERROR, null, null);
    }

    /**
     * @param msg error message
     * @return an error response with the given message and no body
     */
    public static <T> Resp<T> error(String msg) {
        return new Resp<>(ERROR, msg, null);
    }

    /**
     * @param msg  error message
     * @param body body of the response
     * @return an error response with the given message and body
     */
    public static <T> Resp<T> error(String msg, T body) {
        return new Resp<>(ERROR, msg, body);
    }

    /** @return a success response without message and body */
    public static <T> Resp<T> success() {
        return new Resp<>(SUCCESS, null, null);
    }

    /**
     * @param msg message
     * @return a success response with the given message and no body
     */
    public static <T> Resp<T> success(String msg) {
        return new Resp<>(SUCCESS, msg, null);
    }

    /**
     * @param body body of the response
     * @return a success response with the given body and an empty message
     */
    public static <T> Resp<T> success(T body) {
        return new Resp<>(SUCCESS, "", body);
    }

    /**
     * @param msg  message
     * @param body body of the response
     * @return a success response with the given message and body
     */
    public static <T> Resp<T> success(String msg, T body) {
        return new Resp<>(SUCCESS, msg, body);
    }

    public int getCode() {
        return code;
    }

    public void setCode(int code) {
        this.code = code;
    }

    public String getMsg() {
        return msg;
    }

    public void setMsg(String msg) {
        this.msg = msg;
    }

    public void setBody(T body) {
        this.body = body;
    }

    public T getBody() {
        return body;
    }

    /** @return {@code true} when the code is anything other than {@link #SUCCESS} */
    public boolean isError() {
        return code != SUCCESS;
    }

    /** @return {@code true} when the code is {@link #SUCCESS} */
    public boolean isSuccess() {
        return code == SUCCESS;
    }

    /**
     * Renders the response as <code>{code:..,msg:..,body:..}</code>; the
     * message and body parts are left out when they are {@code null}.
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("{");
        sb.append("code:").append(code);
        // The separator is written before each optional part, so there is never a dangling comma.
        if (msg != null) {
            sb.append(",msg:").append(msg);
        }
        if (body != null) {
            sb.append(",body:").append(body);
        }
        sb.append("}");
        return sb.toString();
    }

}
前端代码(参考):
HTML部分:
-
<div style="padding: 15px;">
    <div>
        <input type="text" id="myurl" name="url" placeholder=" 请输入微信公众号文章链接" style="width: 500px; height: 25px">
        <button class="layui-btn layui-btn-sm" id="zhuquBtn">抓取文章</button>
    </div>
    <hr>
    <div id="mybox" class="fugubg" style="width: 500px">
    </div>
</div>
JS部分:
-
// Fetches the article for the entered link and renders it into #mybox.
$("#zhuquBtn").click(function () {

    // Links scraped from the article are untrusted: only http(s) targets are
    // kept, anything else (e.g. "javascript:...") is replaced by "#".
    function safeHref(href) {
        return /^https?:\/\//i.test(href || "") ? href : "#";
    }

    // Builds <div><a href=".." target="_blank"><button>text</button></a></div>
    // without the <div> wrapper when `wrap` is false.
    function linkButton(href, text, wrap) {
        var $link = $("<a>")
            .attr({href: safeHref(href), target: "_blank"})
            .append($("<button>").text(text), " ");
        return wrap ? $("<div>").append($link) : $link;
    }

    // The url input is located by its name attribute, which the markup provides.
    var articleUrl = $("input[name='url']").val();

    $.post("/zhua", {url: articleUrl}, function (rs) {
        if (rs.code != 0) {
            layer.msg(rs.msg, {icon: 2});
            return;
        }

        var $box = $("#mybox").empty();
        var body = rs.body;

        // All scraped text goes through .text()/.attr() so it can never be
        // interpreted as markup.
        $box.append($("<h2>").text(body.title));
        $box.append(document.createTextNode("文章出处:"));
        $box.append(linkButton(body.referLink, body.referName, false));
        $box.append("<hr/>");

        var tags = body.tags;
        for (var i = 0; i < tags.length; i++) {
            var tag = tags[i];
            switch (tag.name) {
                case "img":
                    // Images without their own style fill the box width.
                    var style = typeof tag.style == "undefined" ? "width: 100%" : tag.style;
                    $box.append($("<div>").append(
                        $("<img>").attr({src: tag.datasrc, style: style})));
                    break;
                case "span":
                case "p":
                    $box.append($("<div>").text(tag.text));
                    break;
                case "br":
                    $box.append("<br/>");
                    break;
                case "a":
                    $box.append(linkButton(tag.href, tag.text, true));
                    break;
                default:
                    break;
            }
        }
    });
});
注意:在 HTML 页面的 <head> 中加上 <meta name="referrer" content="never"> 这个 meta 标签,可以解除微信图片的防盗链机制。
转载:https://blog.csdn.net/sunnyzyq/article/details/103837521
查看评论