Java Periodically Crawls Baidu Fengyun List Data






Maven

<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-jdbc</artifactId>
</dependency>
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
</dependency>
<dependency>
    <groupId>org.springframework.boot</groupId>
    <artifactId>spring-boot-starter-test</artifactId>
    <scope>test</scope>
</dependency>
<dependency>
    <groupId>cn.wanghaomiao</groupId>
    <artifactId>JsoupXpath</artifactId>
    <version>2.3.2</version>
</dependency>
<dependency>
    <groupId>cn.hutool</groupId>
    <artifactId>hutool-all</artifactId>
    <version>4.5.1</version>
</dependency>
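
These starters assume a standard Spring Boot application: spring-boot-starter-jdbc auto-configures the JdbcTemplate used below from the usual spring.datasource.* settings, and the @Scheduled cron task in the crawler only fires if scheduling is enabled. A minimal sketch of the entry point (the class name and parent package are assumptions, not part of the original post) could look like this:

package com.hskj.tvdate;

import org.springframework.boot.SpringApplication;
import org.springframework.boot.autoconfigure.SpringBootApplication;
import org.springframework.scheduling.annotation.EnableScheduling;

// Hypothetical bootstrap class: @EnableScheduling is what lets the
// @Scheduled(cron = ...) method in BaiDuSituation run every day at noon.
@EnableScheduling
@SpringBootApplication
public class TvdateApplication {

    public static void main(String[] args) {
        SpringApplication.run(TvdateApplication.class, args);
    }
}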

Code

package com.hskj.tvdate.reptile;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.seimicrawler.xpath.JXDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;

import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import cn.hutool.core.date.DateUtil;
import cn.hutool.http.HttpUtil;
import cn.hutool.json.JSONObject;
import cn.hutool.json.JSONUtil;

/**
 * @program: tvdate
 * @description: Crawl data from the Baidu Fengyun List
 * @author: hw
 * @create: 2020-01-29 17:18
 */
@Component
public class BaiDuSituation {

    @Autowired
    JdbcTemplate jdbcTemplate;

    static ExecutorService executorService = Executors.newFixedThreadPool(30);

    /** Key-value map of movie type to list URL */
    static final Map MAP = new HashMap() {
        {
            put("movie", "http://top.baidu.com/buzz?b=26&c=1&fr=topcategory_c1");
            put("TV Drama", "http://top.baidu.com/buzz?b=4&c=2&fr=topcategory_c2");
            put("Zhongyi", "http://top.baidu.com/buzz?b=19&c=3&fr=topcategory_c3");
            put("Anime", "http://top.baidu.com/buzz?b=23&c=5&fr=topcategory_c5");
            put(" ", "http://top.baidu.com/buzz?b=1677&fr=topbuzz_b23_c5");
            put("documentary", "http://top.baidu.com/buzz?b=1678&fr=topbuzz_b23_c5");
        }
    };

    /**
     * Key-value map of movie type to movie-profile URL. The URL-encoded movie name has to be
     * appended to the value, and the data returned by the link is Unicode-escaped, so it needs
     * to be decoded.
     */
    static final Map KEY_VALUE = new HashMap() {
        {
            put("movie", "http://top.baidu.com/detail/intro?boardid=26&keyword=");
            put("TV Drama", "http://top.baidu.com/detail/intro?boardid=4&keyword=");
            put("Zhongyi", "http://top.baidu.com/detail/intro?boardid=19&keyword=");
            put("Anime", "http://top.baidu.com/detail/intro?boardid=23&keyword=");
            put(" ", "http://top.baidu.com/detail/intro?boardid=1677&keyword=");
            put("documentary", "http://top.baidu.com/detail/intro?boardid=1678&keyword=");
        }
    };

    private static final Logger log = LoggerFactory.getLogger(BaiDuSituation.class);

    /** Method of crawling the data */
    @Scheduled(cron = "00 00 12 * * ?")
    public void addBaiduData() throws Exception {
        String today = DateUtil.today();
        log.info("Baidu Fengyun List crawling scheduled task starts to execute");
        for (Map.Entry url : MAP.entrySet()) {
            String urls = url.getValue().toString();
            String type = url.getKey().toString();
            // Request the list page (GBK encoded) and wrap it for XPath queries
            Document document = Jsoup.parse(new URL(urls).openStream(), "GBK", urls);
            JXDocument underTest = JXDocument.create(document.toString());
            // XPath expressions for the title names and the heat indexes
            String title = "//td[@class='keyword']/a[1]/text()";
            String index = "//td[@class='last']/span/text()";
            List titles = underTest.sel(title);
            List indexs = underTest.sel(index);
            for (int i = 0; i < titles.size(); i++) {
                int finalI = i;
                Thread thread = new Thread(() -> {
                    // Title
                    Object titleName = titles.get(finalI);
                    // Heat index
                    Object index2 = indexs.get(finalI);
                    String urlDeCode = strToUrlDeCode(titleName.toString());
                    String urli = KEY_VALUE.get(type) + urlDeCode;
                    Map jsonToMap = summaryJsonToMap(HttpUtil.get(urli));
                    Object urlBaike = null;
                    Object imageUrl = null;
                    Object imageBig = null;
                    Object brief = null;
                    try {
                        // Encyclopedia link
                        urlBaike = jsonToMap.get("url");
                        // Thumbnail cover
                        imageUrl = jsonToMap.get("image");
                        // Large picture cover
                        imageBig = jsonToMap.get("orin_image");
                        // Introduction
                        brief = jsonToMap.get("abstract");
                    } catch (Exception e) {
                        log.info("There is no introduction for this video: {}", titleName);
                    }
                    addBaiduDateBase(
                            titleName, type, brief, index2, urlBaike, imageUrl, imageBig, finalI, today);
                });
                executorService.execute(thread);
            }
        }
    }

    /**
     * Convert Chinese characters to a URL-encoded string
     *
     * @param str Chinese characters
     * @return URL-encoded string
     */
    public static String strToUrlDeCode(String str) {
        String deCode = "";
        try {
            deCode = URLEncoder.encode(str, "gb18030");
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return deCode;
        }
        return deCode;
    }

    /**
     * Convert the profile JSON to a map
     *
     * @param json profile JSON
     * @return map with the keys: url - Baidu Encyclopedia link, image - small picture cover,
     *         orin_image - large picture cover, abstract - introduction
     */
    public static Map summaryJsonToMap(String json) {
        JSONObject content = null;
        try {
            JSONObject jsonObject = JSONUtil.parseObj(json);
            content = JSONUtil.parseObj(jsonObject.get("content"));
        } catch (Exception e) {
        }
        return content;
    }

    public void addBaiduDateBase(Object... obj) {
        StringBuffer sb = new StringBuffer(
                "insert into baidu_situation(create_day,no,title,`type`,brief,`index`,url_baike,image_url,image_big)values(");
        sb.append(
                "'" + obj[8] + "','" + obj[7] + "','" + obj[0] + "','" + obj[1] + "','" + obj[2]
                        + "','" + obj[3] + "','" + obj[4] + "','" + obj[5] + "','" + obj[6] + "'");
        sb.append(")");
        int update = jdbcTemplate.update(sb.toString());
        if (update < 1) {
            log.error("Failed to insert into the database, the parameters are: {}", obj);
        }
    }
}
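
Titles and briefs often contain single quotes, which can break the string-concatenated insert above (and open it to SQL injection). A drop-in alternative sketch for addBaiduDateBase that binds the same parameters through JdbcTemplate placeholders (the logging call via Arrays.toString is just an illustrative tweak) would be:

// Alternative sketch: bind values with '?' placeholders instead of string
// concatenation, so quotes in titles or briefs cannot break the statement.
public void addBaiduDateBase(Object... obj) {
    String sql = "insert into baidu_situation"
            + "(create_day,no,title,`type`,brief,`index`,url_baike,image_url,image_big)"
            + " values (?,?,?,?,?,?,?,?,?)";
    int update = jdbcTemplate.update(sql,
            obj[8], obj[7], obj[0], obj[1], obj[2], obj[3], obj[4], obj[5], obj[6]);
    if (update < 1) {
        log.error("Failed to insert into the database, the parameters are: {}",
                java.util.Arrays.toString(obj));
    }
}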

Database

CREATE TABLE `baidu_situation` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `title` varchar(255) COLLATE utf8_czech_ci DEFAULT NULL COMMENT 'title',
  `type` varchar(255) COLLATE utf8_czech_ci DEFAULT NULL COMMENT 'type',
  `no` int(11) DEFAULT NULL COMMENT 'ranking',
  `brief` varchar(2550) COLLATE utf8_czech_ci DEFAULT NULL COMMENT 'introduction',
  `index` varchar(255) COLLATE utf8_czech_ci DEFAULT NULL COMMENT 'heat index',
  `url_baike` varchar(500) COLLATE utf8_czech_ci DEFAULT NULL COMMENT 'encyclopedia link',
  `image_url` varchar(500) COLLATE utf8_czech_ci DEFAULT NULL COMMENT 'cover link',
  `image_big` varchar(500) COLLATE utf8_czech_ci DEFAULT NULL COMMENT 'big picture cover link',
  `create_day` varchar(255) COLLATE utf8_czech_ci DEFAULT NULL COMMENT 'year, month and day of creation, used for indexing',
  `remark` varchar(255) CHARACTER SET utf8 DEFAULT NULL COMMENT 'remarks',
  `create_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT 'create time',
  `state` int(11) NOT NULL DEFAULT '0' COMMENT 'data validity (0-valid, 1-invalid)',
  PRIMARY KEY (`id`),
  UNIQUE KEY `titleDay` (`title`, `type`, `create_day`) USING BTREE COMMENT 'unique index to prevent repeated crawling'
) ENGINE=InnoDB AUTO_INCREMENT=301 DEFAULT CHARSET=utf8 COLLATE=utf8_czech_ci
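
To read the crawled data back, for example for an API or a report, a small hypothetical helper on the same JdbcTemplate could be added to the BaiDuSituation component (the method name and query are illustrative, not part of the original post):

// Hypothetical read-back helper: today's list for one type, ordered by the
// crawl position stored in `no`; state = 0 keeps only valid rows.
public List<Map<String, Object>> findTodayList(String type) {
    String sql = "select no, title, `index`, brief, url_baike, image_url"
            + " from baidu_situation"
            + " where create_day = ? and `type` = ? and state = 0"
            + " order by no";
    return jdbcTemplate.queryForList(sql, DateUtil.today(), type);
}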