Python/image_recognition_zhihu.py

203 lines
6.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

# -*- coding:UTF-8 -*-
import requests , time ,random
import hmac ,json ,base64
from bs4 import BeautifulSoup
from hashlib import sha1
import TencentYoutuyun
from PIL import Image
import uuid
def recognition_captcha(data):
    '''Run Tencent Youtu general OCR on a base64-encoded captcha image.

    The decoded bytes are written to ``captcha_<uuid>.gif`` in the current
    working directory, converted to ``captcha_<uuid>.png`` with Pillow, and
    the PNG path is handed to ``youtu.generalocr``. Both files are left on
    disk after the call.

    Args:
        data: Base64 text of the captcha image, or None.

    Returns:
        Whatever ``youtu.generalocr`` returns, or None when ``data`` is None.
    '''
    # Guard first so that no file names are generated for a missing captcha.
    if data is None:
        return None

    file_id = str(uuid.uuid1())
    filename = 'captcha_' + file_id + '.gif'
    filename_png = 'captcha_' + file_id + '.png'

    with open(filename, 'wb') as fb:
        fb.write(base64.b64decode(data.encode('utf-8')))

    # Youtu service credentials; register an account to obtain real values.
    appid = 'appid'
    secret_id = 'secret_id'
    secret_key = 'secret_key'
    userid = 'userid'
    end_point = TencentYoutuyun.conf.API_YOUTU_END_POINT
    youtu = TencentYoutuyun.YouTu(appid, secret_id, secret_key, userid, end_point)

    # The captcha arrives as GIF, but Youtu only accepts JPG/PNG/BMP, so it is
    # converted with Pillow (pip install Pillow). The context manager closes
    # the image handle even if save() raises.
    with Image.open(filename) as im:
        im.save(filename_png, "png")

    # data_type: 0 means a local file path, 1 means a URL.
    return youtu.generalocr(filename_png, data_type=0, seq='')
def get_captcha(sessiona, headers):
    '''Fetch a captcha image from the Zhihu OAuth captcha endpoint.

    Polls the endpoint until the reply contains ``"show_captcha": true``,
    then issues a PUT to obtain the image.

    Args:
        sessiona: Session object providing ``get`` and ``put`` (a
            ``requests.Session`` in this script); cookies are kept on it.
        headers: Header dict sent with every request.

    Returns:
        The ``img_base64`` string from the PUT reply, or None when the PUT
        request or the parsing of its reply fails.
    '''
    captcha_url = 'https://www.zhihu.com/api/v3/oauth/captcha?lang=cn'
    need_cap = False
    while need_cap is not True:
        try:
            # Obtain the _xsrf cookie.
            sessiona.get('https://www.zhihu.com/signin', headers=headers)
            # Obtain the capsion_ticket cookie.
            resp2 = sessiona.get(captcha_url, headers=headers)
            # {"show_captcha": false} means no captcha is required this time.
            need_cap = json.loads(resp2.text)["show_captcha"]
        except Exception:
            # Deliberate best-effort: any network or parse failure simply
            # leads to another attempt after the pause below.
            pass
        # Pause after every attempt, failed ones included, so a persistent
        # error cannot turn this loop into a tight busy-retry.
        time.sleep(0.5 + random.randint(1, 9) / 10)

    try:
        # The captcha image itself is fetched with PUT, not GET.
        resp3 = sessiona.put(captcha_url, headers=headers)
        img_data = json.loads(resp3.text)["img_base64"]
    except Exception:
        return None
    return img_data
def create_point(point_data, confidence):
    '''Build the click-point payload for the low-confidence characters.

    Every word of the first OCR item whose ``confidence`` is below the given
    threshold is mapped to a fixed coordinate by its 1-based position.

    Args:
        point_data: OCR result dict; ``point_data['items'][0]['words']`` is
            read, and each word must carry a ``confidence`` value.
        confidence: Threshold; words scoring strictly below it are selected.

    Returns:
        A JSON string ``{"img_size": [200, 44], "input_points": [...]}`` when
        one or two points were selected; otherwise an empty list. An empty
        list is also returned when the OCR result has no items or words.
    '''
    # In practice the layout is simple: 7 characters, x spaced 25 apart and a
    # shared y. This is a rough simulation of where a user would click.
    points = {
        1: [20.5, 25.1875],
        2: [45.5, 25.1875],
        3: [70.5, 25.1875],
        4: [95.5, 25.1875],
        5: [120.5, 25.1875],
        6: [145.5, 25.1875],
        7: [170.5, 25.1875],
    }

    try:
        words = point_data['items'][0]['words']
    except (KeyError, IndexError, TypeError):
        # Nothing recognised at all: there is nothing to click.
        return []

    # Youtu cannot read upside-down Chinese characters, so those come back
    # with a low confidence. Positions beyond the 7 known slots are skipped.
    input_points = [
        points[position]
        for position, word in enumerate(words, start=1)
        if word['confidence'] < confidence and position in points
    ]

    # Only captchas with one or two upside-down characters have a high
    # success rate; anything else is rejected.
    if not 1 <= len(input_points) <= 2:
        return []

    result = json.dumps({'img_size': [200, 44], 'input_points': input_points})
    print(result)
    return result
def bolting(k_low, k_hi, k3_confidence):
    '''Keep solving captchas until one is accepted and return the time taken.

    Only captchas that look promising are submitted: the OCR ``itemstring``
    must have length 21 and the reported ``angle`` must lie within
    ``[k_low, k_hi]``.

    Relies on the module-level names ``sessiona`` and ``headers`` that the
    ``__main__`` section defines.

    Args:
        k_low: Lowest accepted OCR angle.
        k_hi: Highest accepted OCR angle.
        k3_confidence: Confidence threshold forwarded to ``create_point``.

    Returns:
        Elapsed wall-clock seconds until the server replied with
        ``"success": true``.
    '''
    start = time.time()
    is_success = False
    while is_success is not True:
        points_len = 1
        angle = -20
        img_ko = []
        # Fetch captchas until one satisfies the length and angle filter.
        while points_len != 21 or angle < k_low or angle > k_hi:
            img_data = get_captcha(sessiona, headers)
            img_ko = recognition_captcha(img_data)
            # Debugging hint: to dump the OCR result with readable Chinese,
            # use json.dumps(img_ko, indent=2, ensure_ascii=False).
            try:
                points_len = len(img_ko['items'][0]['itemstring'])
                angle = img_ko['angle']
            except Exception:
                # Unusable OCR result: reset the filter values and try again.
                points_len = 1
                angle = -20
                continue

        input_text = create_point(img_ko, k3_confidence)
        # create_point returns a list (empty) when the captcha is rejected.
        if isinstance(input_text, list):
            continue
        data = {
            "input_text": input_text
        }
        # Submitting too quickly is refused with
        # {"code":120005,"name":"ERR_VERIFY_CAPTCHA_TOO_QUICK"},
        # so pretend to think for about five seconds.
        time.sleep(4 + random.randint(1, 9) / 10)
        try:
            resp5 = sessiona.post('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn', data, headers=headers)
        except Exception:
            continue
        print("angle: " + str(angle))
        # A successful verification is answered with {"success":true}.
        print(BeautifulSoup(resp5.content, 'html.parser'))
        print('-' * 50)
        try:
            is_success = json.loads(resp5.text)["success"]
        except (KeyError, ValueError, TypeError):
            # Missing key, non-JSON body (e.g. an HTML error page) or a
            # non-object payload: treat as a failed attempt and retry.
            continue
    end = time.time()
    return end - start
if __name__ == "__main__":
    # NOTE: sessiona and headers must stay module-level names with exactly
    # these identifiers, because bolting() reads them as globals.
    sessiona = requests.Session()
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0',
        'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
    }
    k3_confidence = 0.71

    # The visualised data is stored in the cloud for browsing:
    # https://plot.ly/~weldon2010/4
    # Purely a learning exercise. Widening the "angle" range showed no
    # visible effect on recognition; most runs finish within 60 s, which
    # shows that Youtu is quite capable and recognises very quickly.

    # With multithreading, a million runs would be even more interesting.
    grid_axis = range(1, 11)

    # 100 successful attempts form a 2-D dataset that is shown as a heat map.
    heat_rows = []
    for _row in grid_axis:
        row_times = []
        for _col in grid_axis:
            row_times.append(bolting(-3, 3, k3_confidence))
            print("y: " + str(heat_rows))
            print("x: " + str(row_times))
        heat_rows.append(row_times)

    separator = "-" * 30
    print(separator)
    print(heat_rows)
    print(separator)

    # Data visualisation: pip install plotly
    import plotly
    import plotly.graph_objs as go

    # Set the account; register on the official site to get credentials.
    plotly.tools.set_credentials_file(username='username', api_key='username')
    heatmap = go.Heatmap(z=heat_rows, x=list(grid_axis), y=list(grid_axis))
    plotly.plotly.plot([heatmap], filename='weldon-time2-heatmap')

# Observation after experimenting: there are almost always only 1~2
# upside-down Chinese characters, which can be exploited to speed things up.
# Widen the angle range and submit only when 1~2 upside-down characters were
# detected; otherwise discard the captcha and keep looking.
### chcp 65001 (changes the cmd code page on Windows)
### python c:\python34\image_recognition_zhihu.py