python学习日志

#Anaconda的包管理会更全面些,但对新版本的支持跟进太慢
#pip升级区别
pip install --upgrade pip
pip3 install --upgrade pip

#安装虚拟环境
pip3 install virtualenv
#创建虚拟环境
virtualenv -p /usr/local/bin/python3 py3env
#进入虚拟环境
source py3env/bin/activate
#退出虚拟环境
deactivate

#python升级后虚拟环境问题,原来软链已经失效,需要重新建立

ls -lsa py3env/
.Python -> /usr/local/Cellar/python3/3.6.2/Frameworks/Python.framework/Versions/3.6/Python
find ./py3env/ -type l -delete
virtualenv -p /usr/local/bin/python3.6 py3env

#pycurl,openssl升级问题
#https://github.com/siznax/wptools/issues/68

export LDFLAGS="-L/usr/local/opt/openssl/lib"
export CPPFLAGS="-I/usr/local/opt/openssl/include"
pip install --no-cache-dir --compile --ignore-installed --install-option="--with-openssl" pycurl

参考 https://www.cnblogs.com/chenice/p/6994111.html
由于上文是用py2,而且有很多 lib 在 py3中已经不支持了,改造如下:

#准备前
pip3 install selenium
pip3 install  Pillow
pip3 install pytesseract
brew tap caskroom/cask
brew cask install chromedriver

#抓取数据当然是py的强项
#测试浏览器模式
# Test driving the browser in GUI / headless mode.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import time

# On macOS there is no X11, so selenium has to drive the system browser GUI.
# See https://www.xquartz.org/ to install X11 on a Mac.
import os

chromedriver = "/usr/local/bin/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver

# On other supported platforms a virtual display can be used instead;
# the system must have xvfb installed for this to work.
from pyvirtualdisplay import Display
display = Display(visible=0, size=(1366, 768))
display.start()

options = webdriver.ChromeOptions()
# BUG FIX: the next three calls originally targeted the undefined name
# `chrome_options`, which raised NameError; they must target `options`.
options.add_argument('--headless')
options.add_argument('--disable-gpu')
options.add_argument('--no-sandbox')  # important: without this Chrome fails to start on Ubuntu
options.add_argument('accept-language="zh-CN,zh;q=0.9,en;q=0.8,en-US;q=0.7"')
# NOTE(review): Chrome command-line switches cannot set HTTP request headers
# such as Host/Referer; the next three arguments are most likely ignored
# by the browser — confirm whether they are actually needed.
options.add_argument('Host="index.baidu.com"')
options.add_argument('Referer="http://index.baidu.com/"')
options.add_argument('SocketLog="SocketLog(tabid=523&client_id=)"')
options.add_argument('user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"')

# BUG FIX: the configured options object was never passed to the driver in
# the original, so none of the arguments above took effect; create the
# driver after configuring the options and hand them over explicitly.
driver = webdriver.Chrome(chromedriver, chrome_options=options)

# Simulated login (with apologies to Baidu :).
driver.get('http://index.baidu.com/?tpl=trend&word=%B1%A6%C2%ED')
e1 = driver.find_element_by_id("TANGRAM__PSP_4__userName")
e1.send_keys("hulupiao")
time.sleep(1)
e2 = driver.find_element_by_id("TANGRAM__PSP_4__password")
e2.send_keys("xxx")
time.sleep(1)
e3 = driver.find_element_by_id("TANGRAM__PSP_4__submit")
e3.click()
time.sleep(8)

# Screenshots work too — powerful.
driver.get_screenshot_as_file('./1.png')
print("截屏结束.................")
driver.quit()
#图文识别
from PIL import Image
import pytesseract

def binarizing(img, threshold):
    """Binarize *img* in place: pixels below *threshold* become 0 (black),
    all others become 255 (white). Returns the same image object."""
    pixels = img.load()
    width, height = img.size
    for col in range(width):
        for row in range(height):
            pixels[col, row] = 0 if pixels[col, row] < threshold else 255
    return img

# Crop the two regions of interest out of the screenshot, binarize them,
# then run OCR over each.
img1 = Image.open("./1.png")
# cleanup: the original unpacked img1.size into w, h but never used them;
# the dead assignment has been removed.

# Crop boxes are (left, upper, right, lower) in pixels.
region1 = (570, 670, 680, 710)
region2 = (756, 670, 860, 710)
cropImg1 = img1.crop(region1)
cropImg2 = img1.crop(region2)
# Convert to 8-bit grayscale ('L') so binarizing() sees scalar pixel values.
img1 = cropImg1.convert('L')
img2 = cropImg2.convert('L')
img1 = binarizing(img1, 200)
img2 = binarizing(img2, 200)
code1 = pytesseract.image_to_string(img1)
code2 = pytesseract.image_to_string(img2)

# image_to_string already returns str (the redundant str() wrappers were
# dropped); strip the dots and spaces OCR tends to introduce.
print("total:", code1.replace(".", "").replace(" ", ""))
print("mobile:", code2.replace(".", "").replace(" ", ""))
img1.save('01.png', 'png')
img2.save('02.png', 'png')

pyspider实例

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017-07-19 10:27:52
# Project: winshang

from pyspider.libs.base_handler import *
import mysql.connector
#from mysql.connector import errorcode

class Handler(BaseHandler):
    """pyspider crawler for winshang.com shopping-mall project listings.

    Walks the paginated search results, follows each project detail page,
    scrapes the key attributes and upserts them into MySQL via REPLACE.
    """

    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        # Result pages are numbered; walk them from 1938 down to 101
        # (equivalent to the original `i = 1938; while i > 100` countdown).
        for page in range(1938, 100, -1):
            url = 'http://bizsearch.winshang.com/xiangmu/s0-c0-t0-k0-x0-d0-z0-n0-m0-l0-q0-b0-y0-pn' + str(page) + '.html'
            self.crawl(url, callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # Follow every project detail link found on the result page.
        for each in response.doc('h2>a[href^="http://biz.winshang.com/html/xm/"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        # The attribute list alternates <span>label</span><span>value</span>,
        # so remember the previous span's text and treat it as the label for
        # the current one. The '-' sentinel matches no label.
        label_to_key = {
            '项目类型': 'type',
            '开业时间': 'start_time',
            '商业面积': 'area',
            '商业楼层': 'floors',
            '连锁项目': 'is_chain',
            '所在城市': 'city',
        }
        project = {'name': response.doc('h1.d-brand-tit').text()}
        prev = '-'
        for each in response.doc('ul.d-inf-status>li>span').items():
            text = each.text().strip()
            key = label_to_key.get(prev)
            if key:
                project[key] = text
            prev = text

        return project

    def on_result(self, result):
        """Persist one scraped project; missing attributes are stored as NULL."""
        if not result:
            return

        cnx = mysql.connector.connect(user='root', database='test')
        try:
            cursor = cnx.cursor()
            try:
                pre_sql = ("REPLACE INTO winshang "
                       "(`name`, `type`,`city`, `start_time`, `area`, `floors`, `is_chain`) "
                       "VALUES (%s, %s, %s, %s, %s, %s, %s)")
                # BUG FIX: detail_page only sets the keys it finds on the
                # page, so direct indexing (result['type'], ...) raised
                # KeyError for incomplete listings; .get() inserts NULL for
                # missing attributes instead.
                data = (result.get('name'), result.get('type'), result.get('city'),
                        result.get('start_time'), result.get('area'),
                        result.get('floors'), result.get('is_chain'))
                cursor.execute(pre_sql, data)
                print(cursor.lastrowid)
                cnx.commit()
            finally:
                # BUG FIX: cursor/connection leaked when execute/commit
                # raised; always release both.
                cursor.close()
        finally:
            cnx.close()
        

发表评论

电子邮件地址不会被公开。 必填项已用*标注