#Anaconda 的包管理会更全面些,但对新版本的支持速度太慢
#pip升级区别
pip install --upgrade pip
pip3 install --upgrade pip
#安装虚拟环境
pip3 install virtualenv
#创建虚拟环境
virtualenv -p /usr/local/bin/python3 py3env
#进入虚拟环境
source py3env/bin/activate
#退出虚拟环境
deactivate
#python升级后虚拟环境问题,原来软链已经失效,需要重新建立
ls -lsa py3env/
.Python -> /usr/local/Cellar/python3/3.6.2/Frameworks/Python.framework/Versions/3.6/Python
find ./py3env/ -type l -delete
virtualenv -p /usr/local/bin/python3.6 py3env
#pycurl,openssl升级问题
#https://github.com/siznax/wptools/issues/68
export LDFLAGS="-L/usr/local/opt/openssl/lib"
export CPPFLAGS="-I/usr/local/opt/openssl/include"
pip install --no-cache-dir --compile --ignore-installed --install-option="--with-openssl" pycurl
参考 https://www.cnblogs.com/chenice/p/6994111.html
由于上文是用py2,而且有很多 lib 在 py3中已经不支持了,改造如下:
# Prerequisites (run in a shell before using this script):
#   pip3 install selenium
#   pip3 install Pillow
#   pip3 install pytesseract
#   brew tap caskroom/cask
#   brew cask install chromedriver

# Scraping data is, of course, one of Python's strengths.
# Browser-mode test: drive Chrome through Selenium, log in to the Baidu Index
# page and save a screenshot of the result.
import os
import sys
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Location of the chromedriver binary installed by the prerequisites above.
CHROMEDRIVER_PATH = "/usr/local/bin/chromedriver"


def build_options():
    """Return the Chrome options used for the scraping session."""
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    # Important: on Ubuntu, Chrome fails to start without --no-sandbox.
    options.add_argument('--no-sandbox')
    # NOTE(review): the next four entries look like HTTP request headers rather
    # than Chrome command-line switches; they are kept exactly as written --
    # confirm that Chrome actually honours them.
    options.add_argument('accept-language="zh-CN,zh;q=0.9,en;q=0.8,en-US;q=0.7"')
    options.add_argument('Host="index.baidu.com"')
    options.add_argument('Referer="http://index.baidu.com/"')
    options.add_argument('SocketLog="SocketLog(tabid=523&client_id=)"')
    options.add_argument('user-agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36"')
    return options


def main():
    """Log in to index.baidu.com and save a screenshot to ./1.png."""
    os.environ["webdriver.chrome.driver"] = CHROMEDRIVER_PATH

    # macOS is not X11-based, so only the system browser GUI can be driven
    # there (PS: X11 can be installed on macOS from https://www.xquartz.org/).
    # On other platforms a virtual display can be used instead; this requires
    # Xvfb to be installed on the system.
    display = None
    if sys.platform != "darwin":
        from pyvirtualdisplay import Display
        display = Display(visible=0, size=(1366, 768))
        display.start()

    driver = None
    try:
        # The options must be handed to the driver, otherwise none of the
        # arguments configured in build_options() take effect.
        # (Selenium 3 style call: first positional argument is the driver path.)
        driver = webdriver.Chrome(CHROMEDRIVER_PATH, options=build_options())

        # Simulate a login -- with apologies to Baidu :)
        driver.get('http://index.baidu.com/?tpl=trend&word=%B1%A6%C2%ED')
        user_field = driver.find_element_by_id("TANGRAM__PSP_4__userName")
        user_field.send_keys("hulupiao")
        time.sleep(1)
        password_field = driver.find_element_by_id("TANGRAM__PSP_4__password")
        password_field.send_keys("xxx")
        time.sleep(1)
        submit_button = driver.find_element_by_id("TANGRAM__PSP_4__submit")
        submit_button.click()
        # Give the page time to finish loading after the login.
        time.sleep(8)

        # Screenshots work too -- powerful.
        driver.get_screenshot_as_file('./1.png')
        print("截屏结束.................")
    finally:
        # Always release the browser and the virtual display, even when a
        # lookup or navigation step above fails.
        if driver is not None:
            driver.quit()
        if display is not None:
            display.stop()


if __name__ == "__main__":
    main()
# Image-to-text recognition (OCR) of two regions cropped from ./1.png.

# Crop boxes (left, upper, right, lower) of the two areas to recognise; the
# recognised text of each is printed under the "total:" / "mobile:" labels.
TOTAL_REGION = (570, 670, 680, 710)
MOBILE_REGION = (756, 670, 860, 710)


def binarizing(img, threshold):
    """Binarize *img* in place and return it.

    Every pixel whose value is below *threshold* becomes 0; every other pixel
    becomes 255.  The image is modified through the pixel-access object
    returned by ``img.load()``.

    Args:
        img: image object exposing ``load()`` and ``size``; the callers in
            this script pass images converted to mode 'L', so each pixel is a
            single integer.
        threshold: cut-off value separating black (0) from white (255).

    Returns:
        The same *img* object, after modification.
    """
    pixdata = img.load()
    width, height = img.size
    for y in range(height):
        for x in range(width):
            pixdata[x, y] = 0 if pixdata[x, y] < threshold else 255
    return img


def _clean(text):
    """Strip dots and spaces from OCR output."""
    return str(text).replace(".", "").replace(" ", '')


def main():
    """Crop, binarize and OCR the two regions, then print and save them."""
    # Imported here so that binarizing() can be imported and reused without
    # Pillow / pytesseract being installed.
    from PIL import Image
    import pytesseract

    # The context manager closes the underlying file once the crops have been
    # converted; convert('L') returns independent greyscale images.
    with Image.open("./1.png") as screenshot:
        img1 = binarizing(screenshot.crop(TOTAL_REGION).convert('L'), 200)
        img2 = binarizing(screenshot.crop(MOBILE_REGION).convert('L'), 200)

    code1 = pytesseract.image_to_string(img1)
    code2 = pytesseract.image_to_string(img2)
    print("total:", _clean(code1))
    print("mobile:", _clean(code2))

    img1.save('01.png', 'png')
    img2.save('02.png', 'png')


if __name__ == "__main__":
    main()
pyspider实例
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2017-07-19 10:27:52
# Project: winshang

from pyspider.libs.base_handler import *
import mysql.connector


class Handler(BaseHandler):
    """pyspider handler that crawls winshang.com project pages into MySQL.

    Flow: ``on_start`` queues the paginated listing pages, ``index_page``
    queues every project link found on a listing page, ``detail_page``
    extracts the project fields, and ``on_result`` stores them in the
    ``winshang`` table.
    """

    crawl_config = {
    }

    # Maps the label text shown on a detail page to the result field that the
    # following <span> value is stored under.
    _LABEL_TO_FIELD = {
        '项目类型': 'type',
        '开业时间': 'start_time',
        '商业面积': 'area',
        '商业楼层': 'floors',
        '连锁项目': 'is_chain',
        '所在城市': 'city',
    }

    # Column order of the REPLACE statement issued by on_result.
    _RESULT_FIELDS = ('name', 'type', 'city', 'start_time', 'area', 'floors',
                      'is_chain')

    @every(minutes=24 * 60)
    def on_start(self):
        """Queue listing pages 1938 down to 101 for crawling."""
        for page in range(1938, 100, -1):
            url = 'http://bizsearch.winshang.com/xiangmu/s0-c0-t0-k0-x0-d0-z0-n0-m0-l0-q0-b0-y0-pn'+str(page)+'.html'
            self.crawl(url, callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        """Queue every project detail link found on a listing page."""
        for each in response.doc('h2>a[href^="http://biz.winshang.com/html/xm/"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        """Extract the project fields from a detail page.

        The status list is a flat sequence of <span> elements; a value is
        stored when the text of the span immediately before it is one of the
        known labels.

        Returns:
            dict with ``name`` plus whichever of ``type``, ``start_time``,
            ``area``, ``floors``, ``is_chain`` and ``city`` were found.
        """
        project = {}
        project['name'] = response.doc('h1.d-brand-tit').text()
        previous = '-'
        for span in response.doc('ul.d-inf-status>li>span').items():
            text = span.text().strip()
            field = self._LABEL_TO_FIELD.get(previous)
            if field is not None:
                project[field] = text
            previous = text
        return project

    def on_result(self, result):
        """Store one extracted project in the ``winshang`` MySQL table.

        Empty results are ignored.  Fields that ``detail_page`` could not
        find are written as NULL instead of raising ``KeyError``.
        """
        if not result:
            return
        pre_sql = ("REPLACE INTO winshang "
                   "(`name`, `type`,`city`, `start_time`, `area`, `floors`, `is_chain`) "
                   "VALUES (%s, %s, %s, %s, %s, %s, %s)")
        data = tuple(result.get(field) for field in self._RESULT_FIELDS)
        cnx = mysql.connector.connect(user='root', database='test')
        # Nested try/finally so the cursor and the connection are released
        # even when execute() or commit() raises.
        try:
            cursor = cnx.cursor()
            try:
                cursor.execute(pre_sql, data)
                print(cursor.lastrowid)
                cnx.commit()
            finally:
                cursor.close()
        finally:
            cnx.close()