# -*- coding:UTF-8 -*-
"""Experiment: solve Zhihu's inverted-Chinese-character captcha with OCR.

The script repeatedly fetches a captcha from Zhihu, runs it through the
Tencent YouTu general OCR endpoint, treats low-confidence characters as the
"inverted" ones, submits their click coordinates and measures how long it
takes until the server answers ``{"success": true}``.  When run as a script
the timings of 100 successful attempts are uploaded to plotly as a heatmap.
"""
import base64
import contextlib
import hmac
import json
import os
import random
import time
import uuid
from hashlib import sha1

import requests
import TencentYoutuyun
from bs4 import BeautifulSoup
from PIL import Image

CAPTCHA_URL = 'https://www.zhihu.com/api/v3/oauth/captcha?lang=cn'
SIGNIN_URL = 'https://www.zhihu.com/signin'

# Failures that simply mean "this attempt did not work, try again":
# network errors, unreadable images / files, malformed base64 or JSON,
# and responses that lack the expected keys or have the wrong shape.
_RETRYABLE_ERRORS = (
    requests.RequestException,
    OSError,
    ValueError,
    KeyError,
    IndexError,
    TypeError,
)


def recognition_captcha(data):
    """Run the captcha image through Tencent YouTu general OCR.

    Args:
        data: Base64-encoded image as ``str`` (as returned by
            :func:`get_captcha`), or ``None``.

    Returns:
        Whatever ``YouTu.generalocr`` returns for the image, or ``None`` when
        ``data`` is ``None``.

    Raises:
        ValueError: If ``data`` is not valid base64 (``binascii.Error``).
        OSError: If the image cannot be written, opened or converted.
    """
    if data is None:
        return None

    raw = base64.b64decode(data.encode('utf-8'))

    file_id = str(uuid.uuid1())
    filename = 'captcha_' + file_id + '.gif'
    filename_png = 'captcha_' + file_id + '.png'

    # Credentials are obtained by registering for the YouTu service.
    appid = 'appid'
    secret_id = 'secret_id'
    secret_key = 'secret_key'
    userid = 'userid'
    end_point = TencentYoutuyun.conf.API_YOUTU_END_POINT
    youtu = TencentYoutuyun.YouTu(appid, secret_id, secret_key, userid, end_point)

    try:
        with open(filename, 'wb') as fb:
            fb.write(raw)

        # The captcha arrives as GIF, but YouTu only accepts JPG / PNG / BMP,
        # so convert it with Pillow (pip install Pillow) first.
        with Image.open(filename) as im:
            im.save(filename_png, "png")

        # data_type: 0 = local file path, 1 = URL.
        return youtu.generalocr(filename_png, data_type=0, seq='')
    finally:
        # The image files are only scratch space; without this cleanup every
        # attempt would leave two files behind in the working directory.
        for path in (filename, filename_png):
            with contextlib.suppress(OSError):
                os.remove(path)


def get_captcha(sessiona, headers):
    """Fetch a captcha image from Zhihu.

    Polls the captcha endpoint until the server reports
    ``{"show_captcha": true}`` and then requests the image itself.

    Args:
        sessiona: ``requests.Session`` used for all requests (it accumulates
            the cookies set by the sign-in and captcha endpoints).
        headers: Request headers sent with every call.

    Returns:
        The base64-encoded image (value of ``img_base64``) as ``str``, or
        ``None`` if the image request fails or has no such key.
    """
    need_cap = False
    while need_cap is not True:
        try:
            # Sets the ``_xsrf`` cookie.
            sessiona.get(SIGNIN_URL, headers=headers)
            # Sets the ``capsion_ticket`` cookie.
            resp2 = sessiona.get(CAPTCHA_URL, headers=headers)
            # {"show_captcha": false} means no captcha is required right now.
            need_cap = json.loads(resp2.text)["show_captcha"]
        except _RETRYABLE_ERRORS:
            need_cap = False  # treat any transient failure as "try again"
        # Pause on every iteration -- including failed ones -- so a persistent
        # error cannot turn this loop into a tight request storm.
        time.sleep(0.5 + random.randint(1, 9) / 10)

    try:
        # The image data is obtained with PUT (not GET).
        resp3 = sessiona.put(CAPTCHA_URL, headers=headers)
        return json.loads(resp3.text)["img_base64"]
    except _RETRYABLE_ERRORS:
        return None


def create_point(point_data, confidence):
    """Build the click-point payload for the characters judged to be inverted.

    Args:
        point_data: OCR result; ``point_data['items'][0]['words']`` must be a
            sequence of dicts each carrying a numeric ``'confidence'``.
        confidence: Characters whose OCR confidence is below this threshold
            are treated as inverted.

    Returns:
        A JSON string ``{"img_size": [200, 44], "input_points": [...]}`` when
        one or two characters were selected; otherwise an empty list, which
        callers use as the "do not submit" signal.
    """
    # In practice the layout is simple: 7 characters, x spacing of 25 and a
    # shared y, so fixed approximate coordinates are good enough.
    points = {
        1: [20.5, 25.1875],
        2: [45.5, 25.1875],
        3: [70.5, 25.1875],
        4: [95.5, 25.1875],
        5: [120.5, 25.1875],
        6: [145.5, 25.1875],
        7: [170.5, 25.1875],
    }

    input_points = []
    for position, word in enumerate(point_data['items'][0]['words'], start=1):
        # YouTu cannot recognise upside-down Chinese characters, so those come
        # back with a low confidence score.
        if word['confidence'] < confidence:
            point = points.get(position)
            if point is not None:  # positions beyond the 7th are ignored
                input_points.append(point)

    # Submitting only pays off when one or two of the characters are inverted.
    if not 1 <= len(input_points) <= 2:
        return []

    result = json.dumps({'img_size': [200, 44], 'input_points': input_points})
    print(result)
    return result


def bolting(k_low, k_hi, k3_confidence, session=None, request_headers=None):
    """Keep trying captchas until one is verified; return the elapsed seconds.

    Only captchas whose OCR result looks trustworthy are submitted: the
    recognised string must have length 21 and the reported angle must lie in
    ``[k_low, k_hi]``.

    Args:
        k_low: Lowest accepted value of the OCR result's ``angle``.
        k_hi: Highest accepted value of the OCR result's ``angle``.
        k3_confidence: Confidence threshold passed to :func:`create_point`.
        session: ``requests.Session`` to use.  Defaults to the module-level
            ``sessiona`` created by the script entry point.
        request_headers: Headers to send.  Defaults to the module-level
            ``headers`` created by the script entry point.

    Returns:
        Wall-clock seconds (``float``) from the call until the server
        confirmed the captcha.
    """
    if session is None:
        session = sessiona  # legacy behaviour: rely on the script's globals
    if request_headers is None:
        request_headers = headers

    start = time.time()
    is_success = False
    while is_success is not True:
        points_len = 1
        angle = -20
        img_ko = None
        while points_len != 21 or angle < k_low or angle > k_hi:
            try:
                img_data = get_captcha(session, request_headers)
                img_ko = recognition_captcha(img_data)
                # NOTE: to dump ``img_ko`` for debugging use
                # json.dumps(img_ko, indent=2, ensure_ascii=False); json.dumps
                # escapes Chinese text as ASCII by default.
                points_len = len(img_ko['items'][0]['itemstring'])
                angle = img_ko['angle']
            except _RETRYABLE_ERRORS:
                # No captcha, undecodable image or unexpected OCR result:
                # reset to values that keep the loop going.
                points_len = 1
                angle = -20

        input_text = create_point(img_ko, k3_confidence)
        if isinstance(input_text, list):
            continue  # create_point declined to produce a payload

        data = {"input_text": input_text}

        # Submitting too quickly is rejected with
        # {"code":120005,"name":"ERR_VERIFY_CAPTCHA_TOO_QUICK"},
        # so pretend to think for about five seconds.
        time.sleep(4 + random.randint(1, 9) / 10)
        try:
            resp5 = session.post(CAPTCHA_URL, data, headers=request_headers)
        except requests.RequestException:
            continue

        print("angle: " + str(angle))
        # A successful verification is answered with {"success": true}.
        print(BeautifulSoup(resp5.content, 'html.parser'))
        print('-' * 50)
        try:
            is_success = json.loads(resp5.text)["success"]
        except (ValueError, KeyError, TypeError):
            # Error payload without "success", or a body that is not JSON.
            continue

    end = time.time()
    return end - start


if __name__ == "__main__":
    sessiona = requests.Session()
    headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0','authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'}
    k3_confidence = 0.71

    # The visualised data is stored in the cloud for browsing:
    # https://plot.ly/~weldon2010/4
    # Purely a learning exercise.  Widening the "angle" range showed no
    # visible effect on recognition; most runs finish within 60 s, which
    # suggests YouTu is quite capable and fast.
    runtime_list_x = []
    runtime_list_y = []
    nn = range(1, 11)  # with threads, a million runs would be more fun

    # Collect 100 successful attempts as a 10x10 grid for the heatmap.
    for y in nn:
        for x in nn:
            runtime_list_x.append(
                bolting(-3, 3, k3_confidence,
                        session=sessiona, request_headers=headers)
            )
            print("y: " + str(runtime_list_y))
            print("x: " + str(runtime_list_x))
        runtime_list_y.append(runtime_list_x.copy())
        runtime_list_x = []

    print("-" * 30)
    print(runtime_list_y)
    print("-" * 30)

    # Data visualisation: pip install plotly
    import plotly
    import plotly.graph_objs as go

    # Set the account; register on the plotly website to get credentials.
    plotly.tools.set_credentials_file(username='username', api_key='username')
    trace = go.Heatmap(z=runtime_list_y, x=list(nn), y=list(nn))
    data = [trace]
    plotly.plotly.plot(data, filename='weldon-time2-heatmap')

# Observation from the experiments: there are almost always 1-2 inverted
# characters, which can be exploited to speed things up -- widen the angle
# range and only submit when exactly 1-2 inverted characters were detected,
# otherwise discard the captcha and keep looking.

### chcp 65001   (switches the cmd code page on Windows)
### python c:\python34\image_recognition_zhihu.py