抓成语

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2020-05-28 07:35:21
# Project: chengyu

from pyspider.libs.base_handler import *
import re
import mysql.connector
import datetime, time

class Handler(BaseHandler):
    """pyspider handler that scrapes idiom (chengyu) entries into MySQL.

    ``on_start`` queues the listing pages, ``index_page`` follows every
    detail link found on them, ``detail_page`` parses one entry into a dict
    and ``on_result`` inserts that dict into the ``chengyu`` table unless a
    row with the same ``ciyu`` already exists.
    """

    crawl_config = {
        # "proxy":ip,
        # The User-Agent must be passed through "headers": a top-level
        # "User-Agent" key is not one of pyspider's crawl parameters, so the
        # header was never actually sent.
        "headers": {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36 SocketLog(tabid=2880&client_id=)",
        },
        # NOTE(review): pyspider documents timeouts in seconds, which makes
        # this 100 minutes (detail pages override it with 60) - confirm.
        "timeout": 6000,
        #"connect_timeout": 3000,
        "retries": 5,
        "auto_recrawl": True,
    }

    # First listing page; the following pages are index_2.htm, index_3.htm, ...
    _LISTING_URL = 'http://www.zd9999.com/cy/'
    # Number of the last listing page that gets queued.
    _LAST_PAGE = 198

    # Layout of the detail text: pinyin, meaning, source, example.
    # re.S lets each field span line breaks in the extracted text.
    _DETAIL_RE = re.compile(r'拼音: (.*)释义: (.*)出处: (.*)示例: (.*)', re.S)

    # NOTE(review): credentials are hard-coded; consider moving them out of
    # the source file.
    _DB_CONFIG = {
        "user": 'root',
        "database": 'test',
        "password": '12345678',
        "host": 'localhost',
    }

    @config(age=5)
    def on_start(self):
        """Queue every listing page for ``index_page``."""
        self.crawl(self._LISTING_URL, callback=self.index_page)
        for page in range(2, self._LAST_PAGE + 1):
            self.crawl('%sindex_%d.htm' % (self._LISTING_URL, page),
                       callback=self.index_page)

    @config(age=5)
    def index_page(self, response):
        """Queue each idiom detail link found on a listing page."""
        for each in response.doc('a[href^="http://www.zd9999.com/cy/htm"]').items():
            print(each.attr.href)
            self.crawl(each.attr.href, callback=self.detail_page, timeout=60)

    @config(priority=2)
    def detail_page(self, response):
        """Parse one idiom page.

        Returns:
            A dict with the keys ``ciyu``, ``pinyin``, ``shiyi``, ``chuchu``
            and ``shili`` (values stripped of surrounding whitespace), or
            ``None`` when the page has no idiom title or its text does not
            follow the expected layout, so that ``on_result`` skips it
            instead of the callback crashing.
        """
        word = (response.doc('font > b').text() or '').strip()
        body = response.doc('.td1o1 td').text() or ''
        match = self._DETAIL_RE.match(body)
        if not word or match is None:
            return None
        pinyin, shiyi, chuchu, shili = (part.strip() for part in match.groups())
        return {
            "ciyu": word,
            "pinyin": pinyin,
            "shiyi": shiyi,
            "chuchu": chuchu,
            "shili": shili,
        }

    def on_result(self, result):
        """Insert a parsed idiom into MySQL unless it is already stored.

        Args:
            result: the dict produced by ``detail_page``; falsy results
                (callbacks that return nothing) are ignored.
        """
        print(result)
        if not result:
            return
        cnx = mysql.connector.connect(**self._DB_CONFIG)
        try:
            cursor = cnx.cursor()
            try:
                # LIMIT 1: only existence matters, and it guarantees that
                # fetchone() leaves no unread rows behind on the cursor.
                cursor.execute(
                    "select * from chengyu where ciyu=%s limit 1",
                    (result["ciyu"],),
                )
                existing = cursor.fetchone()
                print(existing)
                if existing:
                    print('have')
                else:
                    query = ("INSERT INTO chengyu "
                             "(ciyu, pinyin, chuchu, shili,shiyi) "
                             "VALUES (%s, %s, %s, %s, %s)")
                    values = (result["ciyu"], result["pinyin"],
                              result["chuchu"], result["shili"],
                              result["shiyi"])
                    cursor.execute(query, values)
                    cnx.commit()
            finally:
                cursor.close()
        finally:
            # Always release the connection, even when a statement fails.
            cnx.close()

发表评论

电子邮件地址不会被公开。 必填项已用*标注