#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2020-05-28 07:35:21
# Project: chengyu
from pyspider.libs.base_handler import *
import re
import mysql.connector
import datetime, time
class Handler(BaseHandler):
    """pyspider handler that crawls idiom pages from zd9999.com into MySQL.

    Flow: ``on_start`` schedules the listing pages, ``index_page`` follows the
    detail links found on each listing, ``detail_page`` extracts the fields of
    one entry, and ``on_result`` stores that entry in the ``chengyu`` table if
    it is not already there.
    """

    crawl_config = {
        # "proxy":ip,
        # NOTE(review): pyspider documents `user_agent` / `headers` as the
        # options for setting the User-Agent header -- confirm that this
        # top-level "User-Agent" key is actually honored.
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36 SocketLog(tabid=2880&client_id=)",
        # NOTE(review): pyspider's `timeout` is presumably in seconds, which
        # would make this 100 minutes -- confirm the intended unit.
        "timeout": 6000,
        #"connect_timeout": 3000,
        "retries": 5,
        "auto_recrawl": True,
    }

    @config(age=5)
    def on_start(self):
        """Schedule listing pages 1 through 198 for ``index_page``."""
        for i in range(1, 199):
            # The first listing page is the bare directory URL; later pages
            # use the "index_<n>.htm" naming.
            if i == 1:
                url = 'http://www.zd9999.com/cy/'
            else:
                url = 'http://www.zd9999.com/cy/index_' + str(i) + '.htm'
            self.crawl(url, callback=self.index_page)

    @config(age=5)
    def index_page(self, response):
        """Schedule every detail-page link found on a listing page.

        Args:
            response: fetched listing page; its ``doc`` is queried for anchors
                whose href starts with the site's ``/cy/htm`` prefix.
        """
        for each in response.doc('a[href^="http://www.zd9999.com/cy/htm"]').items():
            print(each.attr.href)
            self.crawl(each.attr.href, callback=self.detail_page, timeout=60)

    @config(priority=2)
    def detail_page(self, response):
        """Extract one entry's fields from a detail page.

        Args:
            response: fetched detail page.

        Returns:
            A dict with the keys ``ciyu``, ``pinyin``, ``shiyi``, ``chuchu``
            and ``shili``, or ``None`` when the page text does not follow the
            expected "拼音/释义/出处/示例" layout.
        """
        c = response.doc('font > b').text()
        s = response.doc('.td1o1 td').text()
        arr = re.match(r'拼音: (.*)释义: (.*)出处: (.*)示例: (.*)', s, re.M|re.I)
        if arr is None:
            # A page with a different layout used to crash here with
            # AttributeError on arr.group(); returning None makes on_result
            # skip it instead.
            print('unparsable detail page: %s' % response.url)
            return None
        return {
            "ciyu": c,
            "pinyin": arr.group(1),
            "shiyi": arr.group(2),
            "chuchu": arr.group(3),
            "shili": arr.group(4),
        }

    def on_result(self, result):
        """Insert a parsed entry into ``chengyu`` unless it already exists.

        Args:
            result: dict produced by ``detail_page``; falsy values are ignored.
        """
        print(result)
        if not result:
            return
        # NOTE(review): credentials are hard-coded; consider moving them to
        # configuration outside the script.
        cnx = mysql.connector.connect(user='root', database='test', password='12345678', host='localhost')
        try:
            cursor = cnx.cursor()
            try:
                # LIMIT 1: only existence matters, and a single fetchone() on
                # an unbounded result would leave unread rows on the
                # connection when duplicates exist.
                cursor.execute("select * from chengyu where ciyu=%s limit 1", (result["ciyu"],))
                r = cursor.fetchone()
                print(r)
                if r:
                    print('have')
                else:
                    query = ("INSERT INTO chengyu "
                             "(ciyu, pinyin, chuchu, shili,shiyi) "
                             "VALUES (%s, %s, %s, %s, %s)")
                    arr = (result["ciyu"], result["pinyin"], result["chuchu"], result["shili"], result["shiyi"])
                    cursor.execute(query, arr)
                    cnx.commit()
            finally:
                cursor.close()
        finally:
            # Always release the connection, including when a query fails.
            cnx.close()