飞道的博客

Java 爬取微信公众号文章(文字 + 图片)

248人阅读  评论(0)

Maven依赖:


  
  <!-- Alibaba fastjson: JSON model used for the scraped article -->
  <dependency>
      <groupId>com.alibaba</groupId>
      <artifactId>fastjson</artifactId>
      <version>1.2.47</version>
  </dependency>
  <!-- Apache HttpClient: HTTP requests -->
  <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpclient</artifactId>
      <version>4.5.2</version>
  </dependency>
  <!-- Apache HttpMime: provides org.apache.http.entity.mime.* used by HttpTool.postFile -->
  <dependency>
      <groupId>org.apache.httpcomponents</groupId>
      <artifactId>httpmime</artifactId>
      <version>4.5.2</version>
  </dependency>
  <!-- Jsoup: HTML parsing -->
  <dependency>
      <groupId>org.jsoup</groupId>
      <artifactId>jsoup</artifactId>
      <version>1.11.3</version>
  </dependency>

线上测试点这里

爬取工具类:


  
  1. package com.zyq.tools;
  2. import java.util.HashMap;
  3. import java.util.LinkedHashMap;
  4. import java.util.Map;
  5. import org.jsoup.Jsoup;
  6. import org.jsoup.nodes.Attribute;
  7. import org.jsoup.nodes.Attributes;
  8. import org.jsoup.nodes.Document;
  9. import org.jsoup.nodes.Element;
  10. import org.jsoup.select.Elements;
  11. import com.alibaba.fastjson.JSONArray;
  12. import com.alibaba.fastjson.JSONObject;
  13. /**
  14. * 文章爬取工具类
  15. *
  16. * @author ZhangYuanqiang
  17. * @since 2020/01/04
  18. */
  19. public class SpiderUtil {
  20. // 微信公众号文章域名
  21. private static final String WX_DOMAIN = "https://mp.weixin.qq.com";
  22. // 文章返回前端统一key常量
  23. private static final String KEY_TITLE = "title"; // 文章标题
  24. private static final String KEY_COVER_URL = "coverLink"; // 文章封面图链接
  25. private static final String KEY_REFER_NAME = "referName"; // 文章出处作者
  26. private static final String KEY_REFER_URL = "referLink"; // 文章出处链接
  27. private static final String KEY_TAGS = "tags"; // 文章内容
  28. private static final String KEY_NAME = "name"; // 标签名称
  29. private static final String KEY_TEXT = "text"; // 文本信息
  30. private static final String KEY_HREF = "href"; // a标签链接
  31. /**
  32. * 测试主方法
  33. */
  34. public static void main(String args[]) {
  35. String url = "https://mp.weixin.qq.com/s/OEjKIxTRFSY5lcNk6YIlUg";
  36. Resp<JSONObject> resp = getActicle(url);
  37. if (resp.isSuccess()) {
  38. System.out.println(resp.getBody());
  39. } else {
  40. System.out.println(resp.getMsg());
  41. }
  42. }
  43. /**
  44. * 根据文章链接抓取文章内容
  45. *
  46. * @param url 文章链接
  47. * @return 文章内容
  48. */
  49. public static Resp<JSONObject> getActicle(String url) {
  50. // 检测链接是否合法
  51. String msg = checkUrl(url);
  52. if (msg != null) {
  53. return Resp.error(msg);
  54. }
  55. // 请求与响应
  56. String resp = HttpTool.get(url, getWxHeaderMap());
  57. if (resp == null || resp.trim().length() == 0) {
  58. return Resp.error( "文章获取失败,请检查链接是否正确");
  59. }
  60. // 解析
  61. Resp<JSONObject> acticleResp = getWxActicleContent(resp, url);
  62. if (acticleResp.isError()) {
  63. return Resp.error(acticleResp.getMsg());
  64. }
  65. return acticleResp;
  66. }
  67. /**
  68. * 检测文章链接是否合法
  69. */
  70. public static String checkUrl(String url) {
  71. if (url == null) {
  72. return "请输入文章链接";
  73. }
  74. if (!url.startsWith(WX_DOMAIN)) {
  75. return "请输入微信公众号文章链接";
  76. }
  77. return null;
  78. }
  79. /**
  80. * 微信公众号请求头设置
  81. */
  82. public static Map<String, String> getWxHeaderMap() {
  83. Map<String, String> map = new HashMap<>( new LinkedHashMap<>());
  84. map.put( "Accept", "text/html, application/xhtml+xml, image/jxr, */*");
  85. map.put( "Accept-Encoding", "gzip, deflate");
  86. map.put( "Accept-Language", "zh-Hans-CN, zh-Hans; q=0.8, en-US; q=0.5, en; q=0.3");
  87. map.put( "Host", "mp.weixin.qq.com");
  88. map.put( "If-Modified-Since", "Sat, 04 Jan 2020 12:23:43 GMT");
  89. map.put( "User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko");
  90. return map;
  91. }
  92. /**
  93. * 解析微信公众号文章
  94. *
  95. * @param resp 请求文章响应
  96. * @param url 文章链接
  97. * @return 文章信息
  98. */
  99. public static Resp<JSONObject> getWxActicleContent(String resp, String url) {
  100. try {
  101. Document document = Jsoup.parse(resp);
  102. // 文章出处(作者)
  103. String referName = document.getElementsByClass( "profile_nickname").get( 0).text();
  104. // 文章封面图链接
  105. String coverUrl = document.select( "meta[property=\"og:image\"]").get( 0).attr( "content");
  106. // 文章标题
  107. String title = document.getElementById( "activity-name").text();
  108. // 文章内容
  109. Element content = document.getElementsByClass( "rich_media_area_primary_inner").get( 0);
  110. JSONObject json = new JSONObject( new LinkedHashMap<>());
  111. json.put(KEY_TITLE, title);
  112. json.put(KEY_COVER_URL, coverUrl);
  113. json.put(KEY_REFER_NAME, referName);
  114. json.put(KEY_REFER_URL, url);
  115. JSONArray tags = new JSONArray();
  116. Elements sections = content.select( "*");
  117. for (Element element : sections) {
  118. if (element.children().isEmpty()) {
  119. getChildTag(element, tags);
  120. }
  121. }
  122. json.put(KEY_TAGS, tags);
  123. return Resp.success(json);
  124. } catch (Exception e) {
  125. e.printStackTrace();
  126. return Resp.error( "文章解析失败");
  127. }
  128. }
  129. public static void getChildTag(Element element, JSONArray tags) {
  130. JSONObject tag = new JSONObject( new LinkedHashMap<>());
  131. String tagName = element.tagName();
  132. tag.put(KEY_NAME, tagName);
  133. switch (tagName) {
  134. case "span": {
  135. tag.put(KEY_TEXT, element.text());
  136. tags.add(tag);
  137. break;
  138. }
  139. case "img": {
  140. Attributes attrs = element.attributes();
  141. if (attrs != null) {
  142. for (Attribute attr : attrs) {
  143. tag.put(attr.getKey().replace( "-", ""), attr.getValue());
  144. }
  145. }
  146. tags.add(tag);
  147. break;
  148. }
  149. case "a": {
  150. tag.put(KEY_HREF, element.attr( "href"));
  151. tag.put(KEY_TEXT, element.attr( "textvalue"));
  152. tags.add(tag);
  153. break;
  154. }
  155. case "br": {
  156. tags.add(tag);
  157. break;
  158. }
  159. case "p": {
  160. tag.put(KEY_TEXT, element.text());
  161. tags.add(tag);
  162. break;
  163. }
  164. default:
  165. break;
  166. }
  167. }
  168. }

Httpclient工具类:


  
  1. package com.zyq.tools;
  2. import java.io.BufferedReader;
  3. import java.io.File;
  4. import java.io.FileOutputStream;
  5. import java.io.IOException;
  6. import java.io.InputStream;
  7. import java.io.InputStreamReader;
  8. import java.io.UnsupportedEncodingException;
  9. import java.util.ArrayList;
  10. import java.util.List;
  11. import java.util.Map;
  12. import java.util.Map.Entry;
  13. import org.apache.http.HttpEntity;
  14. import org.apache.http.HttpResponse;
  15. import org.apache.http.client.entity.UrlEncodedFormEntity;
  16. import org.apache.http.client.methods.HttpGet;
  17. import org.apache.http.client.methods.HttpPost;
  18. import org.apache.http.client.methods.HttpUriRequest;
  19. import org.apache.http.entity.ContentType;
  20. import org.apache.http.entity.mime.MultipartEntityBuilder;
  21. import org.apache.http.entity.mime.content.FileBody;
  22. import org.apache.http.entity.mime.content.StringBody;
  23. import org.apache.http.impl.client.HttpClients;
  24. import org.apache.http.message.BasicNameValuePair;
  25. /**
  26. * Httpclient工具类
  27. *
  28. * @author sunnyzyq
  29. * @since 2019/04/22
  30. */
  31. public class HttpTool {
  32. private static final int BYTE_LEN = 102400; // 100KB
  33. private static final String CHARSET = "UTF-8"; // 编码格式
  34. /**
  35. * get请求
  36. * @param url 请求地址(get请求时参数自己组装到url上)
  37. * @return 响应文本
  38. */
  39. public static String get(String url) {
  40. // 请求地址,以及参数设置
  41. HttpGet get = new HttpGet(url);
  42. // 执行请求,获取相应
  43. return getRespString(get);
  44. }
  45. /**
  46. * get请求
  47. * @param url 请求地址(get请求时参数自己组装到url上)
  48. * @param headerMap 请求头
  49. * @return 响应文本
  50. */
  51. public static String get(String url, Map<String, String> headerMap) {
  52. // 请求地址,以及参数设置
  53. HttpGet get = new HttpGet(url);
  54. if (headerMap != null) {
  55. for (Entry<String, String> entry : headerMap.entrySet()) {
  56. get.setHeader(entry.getKey(), entry.getValue());
  57. }
  58. }
  59. // 执行请求,获取相应
  60. return getRespString(get);
  61. }
  62. /**
  63. * post 请求
  64. * @param url 请求地址
  65. * @param params 请求参数
  66. * @return 响应文本
  67. */
  68. public static String post(String url, Map<String, String> params){
  69. // 构建post请求
  70. HttpPost post = new HttpPost(url);
  71. // 构建请求参数
  72. List<BasicNameValuePair> pairs = new ArrayList<BasicNameValuePair>();
  73. if (params != null) {
  74. for (Entry<String, String> entry : params.entrySet()) {
  75. pairs.add( new BasicNameValuePair(entry.getKey(), entry.getValue()));
  76. }
  77. }
  78. HttpEntity entity = null;
  79. try {
  80. entity = new UrlEncodedFormEntity(pairs, CHARSET);
  81. } catch (UnsupportedEncodingException e) {
  82. e.printStackTrace();
  83. }
  84. post.setEntity(entity);
  85. // 执行情趣,获取相应
  86. return getRespString(post);
  87. }
  88. /**
  89. * 文件上传
  90. * @param url 请求地址
  91. * @param params 请求参数 (文件类型须为File)
  92. * @return 响应文本
  93. */
  94. public static String postFile(String url, Map<String, Object> params) {
  95. HttpPost post = new HttpPost(url);
  96. MultipartEntityBuilder builder = MultipartEntityBuilder.create();
  97. if (params != null) {
  98. for (String key : params.keySet()) {
  99. Object value = params.get(key);
  100. if (value == null) {
  101. builder.addPart(key, new StringBody( "",ContentType.TEXT_PLAIN));
  102. continue;
  103. }
  104. if (value instanceof File) {
  105. builder.addPart(key, new FileBody((File) value));
  106. } else {
  107. builder.addPart(key, new StringBody(value.toString(), ContentType.TEXT_PLAIN));
  108. }
  109. }
  110. }
  111. HttpEntity entity = builder.build();
  112. post.setEntity(entity);
  113. return getRespString(post);
  114. }
  115. /**
  116. * 文件下载
  117. */
  118. public static void getFile(String url, String name) {
  119. // 图片地址
  120. HttpGet get = new HttpGet(url);
  121. // 执行请求,获取响应流
  122. InputStream in = getRespInputStream(get);
  123. // InputStream 转 File,保存在当前工程中
  124. File file = new File(name);
  125. try {
  126. FileOutputStream fos = new FileOutputStream(file);
  127. byte b[] = new byte[BYTE_LEN];
  128. int j = 0;
  129. while( (j = in.read(b)) != - 1){
  130. fos.write(b, 0, j);
  131. }
  132. fos.close();
  133. } catch (Exception e) {
  134. e.printStackTrace();
  135. }
  136. }
  137. /**
  138. * 获取响应信息(String)
  139. */
  140. public static String getRespString(HttpUriRequest request) {
  141. // 获取响应流
  142. InputStream in = getRespInputStream(request);
  143. StringBuilder sb = new StringBuilder();
  144. String line;
  145. BufferedReader br = new BufferedReader( new InputStreamReader(in));
  146. try {
  147. while ((line = br.readLine()) != null) {
  148. sb.append(line);
  149. }
  150. } catch (IOException e) {
  151. e.printStackTrace();
  152. }
  153. String str = sb.toString();
  154. return str;
  155. }
  156. /**
  157. * 获取响应信息(InputStream)
  158. */
  159. public static InputStream getRespInputStream(HttpUriRequest request) {
  160. // 获取响应对象
  161. HttpResponse response = null;
  162. try {
  163. response = HttpClients.createDefault().execute(request);
  164. } catch (Exception e) {
  165. e.printStackTrace();
  166. }
  167. if (response == null) {
  168. return null;
  169. }
  170. // 获取Entity对象
  171. HttpEntity entity = response.getEntity();
  172. // 获取响应信息流
  173. InputStream in = null;
  174. if (entity != null) {
  175. try {
  176. in = entity.getContent();
  177. } catch (Exception e) {
  178. e.printStackTrace();
  179. }
  180. }
  181. return in;
  182. }
  183. }

响应工具类:


  
  1. package com.zyq.tools;
  /**
   * Generic response wrapper: a status code, an optional message and an
   * optional body.
   *
   * @param <T> type of the body
   * @author sunnyzyq
   * @since 2019/04/23
   */
  public class Resp<T> {

      public static final int SUCCESS = 0;
      public static final int ERROR = 1;

      int code = SUCCESS;
      String msg;
      T body;

      /** Creates a successful response without message or body. */
      public Resp() {}

      /** Creates a successful response carrying the given body. */
      public Resp(T t) {
          this.body = t;
      }

      /** Creates a response with every field given explicitly. */
      public Resp(int code, String msg, T body) {
          this.code = code;
          this.msg = msg;
          this.body = body;
      }

      /** @return an error response without message or body */
      public static <T> Resp<T> error() {
          return new Resp<>(ERROR, null, null);
      }

      /** @return an error response carrying a message */
      public static <T> Resp<T> error(String msg) {
          return new Resp<>(ERROR, msg, null);
      }

      /** @return an error response carrying a message and a body */
      public static <T> Resp<T> error(String msg, T body) {
          return new Resp<>(ERROR, msg, body);
      }

      /** @return a success response without message or body */
      public static <T> Resp<T> success() {
          return new Resp<>(SUCCESS, null, null);
      }

      /** @return a success response carrying a message */
      public static <T> Resp<T> success(String msg) {
          return new Resp<>(SUCCESS, msg, null);
      }

      /** @return a success response carrying a body and an empty message */
      public static <T> Resp<T> success(T body) {
          return new Resp<>(SUCCESS, "", body);
      }

      /** @return a success response carrying a message and a body */
      public static <T> Resp<T> success(String msg, T body) {
          return new Resp<>(SUCCESS, msg, body);
      }

      public int getCode() {
          return code;
      }

      public void setCode(int code) {
          this.code = code;
      }

      public String getMsg() {
          return msg;
      }

      public void setMsg(String msg) {
          this.msg = msg;
      }

      public void setBody(T body) {
          this.body = body;
      }

      public T getBody() {
          return body;
      }

      /** @return {@code true} when the code is anything other than {@link #SUCCESS} */
      public boolean isError() {
          return code != SUCCESS;
      }

      /** @return {@code true} when the code equals {@link #SUCCESS} */
      public boolean isSuccess() {
          return code == SUCCESS;
      }

      /**
       * Renders the response as {@code {code:N,msg:M,body:B}}; {@code msg}
       * and {@code body} are left out when they are {@code null}.
       */
      @Override
      public String toString() {
          StringBuilder sb = new StringBuilder("{");
          sb.append("code:").append(code);
          // The separator is written before each optional field so that no
          // trailing comma is left when later fields are absent
          if (msg != null) {
              sb.append(",msg:").append(msg);
          }
          if (body != null) {
              sb.append(",body:").append(body);
          }
          return sb.append("}").toString();
      }
  }

前端代码(参考):

HTML部分:


  
  <div style="padding: 15px;">
      <div>
          <!-- id="myurl" is what the click handler reads via $("#myurl") -->
          <input type="text" id="myurl" name="url" placeholder=" 请输入微信公众号文章链接" style="width: 500px; height: 25px">
          <button class="layui-btn layui-btn-sm" id="zhuquBtn">抓取文章 </button>
      </div>
      <hr>
      <!-- The fetched article is rendered into this container -->
      <div id="mybox" class="fugubg" style="width: 500px">
      </div>
  </div>

JS部分:


  
  // Fetch the article behind the entered link and render it into #mybox.
  $("#zhuquBtn").click(function () {
      // The input in the page is identified by name="url"
      var url = $("input[name='url']").val();
      $.post("/zhua", { url: url }, function (rs) {
          if (rs.code != 0) {
              layer.msg(rs.msg, { icon: 2 });
              return;
          }
          var box = $("#mybox").empty();
          var body = rs.body;

          // Scraped values are inserted with .text()/.attr() so that markup
          // inside them is shown as text instead of being interpreted as HTML
          box.append($("<h2>").text(body.title));
          var referLink = $("<a>").attr({ href: body.referLink, target: "_blank" })
              .append($("<button>").text(body.referName))
              .append(document.createTextNode(" "));
          box.append(document.createTextNode("文章出处:")).append(referLink);
          box.append("<hr/>");

          var tags = body.tags;
          for (var i = 0; i < tags.length; i++) {
              var tag = tags[i];
              var name = tag.name;
              if (name == "img") {
                  // Fall back to full width when the image carries no style
                  var style = typeof tag.style == "undefined" ? "width: 100%" : tag.style;
                  var img = $("<img>").attr({ src: tag.datasrc, style: style });
                  box.append($("<div>").append(img));
              } else if (name == "span" || name == "p") {
                  box.append($("<div>").text(tag.text));
              } else if (name == "br") {
                  box.append("<br/>");
              } else if (name == "a") {
                  var link = $("<a>").attr({ href: tag.href, target: "_blank" })
                      .append($("<button>").text(tag.text))
                      .append(document.createTextNode(" "));
                  box.append($("<div>").append(link));
              }
          }
      });
  });

注意:在 HTML 页面的 <head> 中加上 <meta name="referrer" content="no-referrer"> 这个 meta 标签,可以解除微信图片的防盗链机制。


转载:https://blog.csdn.net/sunnyzyq/article/details/103837521
查看评论
* 以上用户言论只代表其个人观点,不代表本网站的观点或立场