Open l1t1 opened 1 year ago
可以考虑把file_paths先分好,让不同进程处理不同的图片,然后把初始化CnOcr() 挪到 ocr_worker() 开头。
谢谢
假设我把原始的文件名列表分成了4个: list[0],list[1],list[2],list[3] 怎么编写其余的?
这样不行
#-*- coding: utf-8 -*-
import glob
import multiprocessing
import pytesseract
from PIL import Image
import time
from cnocr import CnOcr
def split_list(a,num):
s=len(a)//num
r=len(a)%num
b=[]
for i in range(num-1):
b.append(a[s*i:s*i+s])
if r>0:
b.append(a[s*(num-1):len(a)])
else:
b.append(a[s*(num-1):s*num])
return b
def ocr_worker(img_path):
try:
# 加载图片
#img = Image.open(img_path)
# 识别文字
ocr = CnOcr()
print(img_path) #输出文件列表
for i in img_path:
text=','.join(sum([[x['text'] for x in ocr.ocr(i)]],[]))
#text = pytesseract.image_to_string(img, lang='chi_sim')
# 输出识别结果
#print(f'{img_path}: {text}')
with open(img_path+'.txt', 'w') as f:
f.write(text)
except:
print(f'{img_path}: OCR Failed')
if __name__ == '__main__':
#def m():
t=time.time()
# 获取所有图片路径
file_paths = glob.glob('d:/test_ocr/*.jpg')
# 创建多个进程,每个进程处理一张图片
num_procs = 4 #multiprocessing.cpu_count()
print('num_procs:',num_procs )
sub=split_list(file_paths,num_procs)
pool = multiprocessing.Pool(processes=num_procs)
for img_path in sub:
pool.apply_async(ocr_worker, (img_path,))
# 关闭进程池
pool.close()
pool.join()
print(time.time()-t)
输出 D:>python test_ocr4.py num_procs: 4 [WARNING 2023-03-17 16:06:53,471 _showwarnmsg:109] D:\Python38\lib\site-packages\onnxruntime\capi\onnxruntime_validation.py:26: UserWarning: Unsupport ed Windows version (7). ONNX Runtime supports Windows 10 and above, only. warnings.warn(
[WARNING 2023-03-17 16:06:53,567 _showwarnmsg:109] D:\Python38\lib\site-packages\onnxruntime\capi\onnxruntime_validation.py:26: UserWarning: Unsupport ed Windows version (7). ONNX Runtime supports Windows 10 and above, only. warnings.warn(
[WARNING 2023-03-17 16:06:53,653 _showwarnmsg:109] D:\Python38\lib\site-packages\onnxruntime\capi\onnxruntime_validation.py:26: UserWarning: Unsupport ed Windows version (7). ONNX Runtime supports Windows 10 and above, only. warnings.warn(
[WARNING 2023-03-17 16:06:53,708 _showwarnmsg:109] D:\Python38\lib\site-packages\onnxruntime\capi\onnxruntime_validation.py:26: UserWarning: Unsupport ed Windows version (7). ONNX Runtime supports Windows 10 and above, only. warnings.warn(
['d:/test_ocr\1365670868_4787.jpg'] ['d:/test_ocr\1365670999_8693.jpg'] ['d:/test_ocr\webwxgetmsgimg (36).jpg'] ['d:/test_ocr\1365671153_2417.jpg'] ['d:/test_ocr\1365670868_4787.jpg']: OCR Failed ['d:/test_ocr\1365670999_8693.jpg']: OCR Failed ['d:/test_ocr\1365671153_2417.jpg']: OCR Failed ['d:/test_ocr\webwxgetmsgimg (36).jpg']: OCR Failed 8.861506938934326
把open(img_path.. 改成 open(i 以后,好了
改成100个文件,4个列表,cpu24个核心全100%,为什么?
时间比串行还长
D:>python test_ocr4.py num_procs: 4 [WARNING 2023-03-17 16:15:39,752 _showwarnmsg:109] D:\Python38\lib\site-packages\onnxruntime\capi\onnxruntime_validation.py:26: UserWarning: Unsuppor ed Windows version (7). ONNX Runtime supports Windows 10 and above, only. warnings.warn(
[WARNING 2023-03-17 16:15:39,769 _showwarnmsg:109] D:\Python38\lib\site-packages\onnxruntime\capi\onnxruntime_validation.py:26: UserWarning: Unsuppor ed Windows version (7). ONNX Runtime supports Windows 10 and above, only. warnings.warn(
[WARNING 2023-03-17 16:15:39,812 _showwarnmsg:109] D:\Python38\lib\site-packages\onnxruntime\capi\onnxruntime_validation.py:26: UserWarning: Unsuppor ed Windows version (7). ONNX Runtime supports Windows 10 and above, only. warnings.warn(
[WARNING 2023-03-17 16:15:39,973 _showwarnmsg:109] D:\Python38\lib\site-packages\onnxruntime\capi\onnxruntime_validation.py:26: UserWarning: Unsuppor ed Windows version (7). ONNX Runtime supports Windows 10 and above, only. warnings.warn(
['d:/test_ocr/3\001.png', 'd:/test_ocr/3\002.png', 'd:/test_ocr/3\003.png', 'd:/test_ocr/3\004.png', 'd:/test_ocr/3\005.png', 'd:/test_ocr/3\00 .png', 'd:/test_ocr/3\007.png', 'd:/test_ocr/3\008.png', 'd:/test_ocr/3\009.png', 'd:/test_ocr/3\010.png', 'd:/test_ocr/3\011.png', 'd:/test_ocr 3\012.png', 'd:/test_ocr/3\013.png', 'd:/test_ocr/3\014.png', 'd:/test_ocr/3\015.png', 'd:/test_ocr/3\016.png', 'd:/test_ocr/3\017.png', 'd:/te t_ocr/3\018.png', 'd:/test_ocr/3\019.png', 'd:/test_ocr/3\020.png', 'd:/test_ocr/3\021.png', 'd:/test_ocr/3\022.png', 'd:/test_ocr/3\023.png', d:/test_ocr/3\024.png', 'd:/test_ocr/3\025.png'] libpng warning: iCCP: known incorrect sRGB profile ['d:/test_ocr/3\026.png', 'd:/test_ocr/3\027.png', 'd:/test_ocr/3\028.png', 'd:/test_ocr/3\029.png', 'd:/test_ocr/3\030.png', 'd:/test_ocr/3\03 .png', 'd:/test_ocr/3\032.png', 'd:/test_ocr/3\033.png', 'd:/test_ocr/3\034.png', 'd:/test_ocr/3\035.png', 'd:/test_ocr/3\036.png', 'd:/test_ocr 3\037.png', 'd:/test_ocr/3\038.png', 'd:/test_ocr/3\039.png', 'd:/test_ocr/3\040.png', 'd:/test_ocr/3\041.png', 'd:/test_ocr/3\042.png', 'd:/te t_ocr/3\043.png', 'd:/test_ocr/3\044.png', 'd:/test_ocr/3\045.png', 'd:/test_ocr/3\046.png', 'd:/test_ocr/3\047.png', 'd:/test_ocr/3\048.png', d:/test_ocr/3\049.png', 'd:/test_ocr/3\050.png']['d:/test_ocr/3\051.png', 'd:/test_ocr/3\052.png', 'd:/test_ocr/3\053.png', 'd:/test_ocr/3\054. ng', 'd:/test_ocr/3\055.png', 'd:/test_ocr/3\056.png', 'd:/test_ocr/3\057.png', 'd:/test_ocr/3\058.png', 'd:/test_ocr/3\059.png', 'd:/test_ocr/3 \060.png', 'd:/test_ocr/3\061.png', 'd:/test_ocr/3\062.png', 'd:/test_ocr/3\063.png', 'd:/test_ocr/3\064.png', 'd:/test_ocr/3\065.png', 'd:/test ocr/3\066.png', 'd:/test_ocr/3\067.png', 'd:/test_ocr/3\068.png', 'd:/test_ocr/3\069.png', 'd:/test_ocr/3\070.png', 'd:/test_ocr/3\071.png', 'd /test_ocr/3\072.png', 'd:/test_ocr/3\073.png', 'd:/test_ocr/3\074.png', 'd:/test_ocr/3\075.png']
libpng warning: iCCP: known incorrect sRGB profile libpng warning: iCCP: known incorrect sRGB profile ['d:/test_ocr/3\076.png', 'd:/test_ocr/3\077.png', 'd:/test_ocr/3\078.png', 'd:/test_ocr/3\079.png', 'd:/test_ocr/3\080.png', 'd:/test_ocr/3\08 .png', 'd:/test_ocr/3\082.png', 'd:/test_ocr/3\083.png', 'd:/test_ocr/3\084.png', 'd:/test_ocr/3\085.png', 'd:/test_ocr/3\086.png', 'd:/test_ocr 3\087.png', 'd:/test_ocr/3\088.png', 'd:/test_ocr/3\089.png', 'd:/test_ocr/3\090.png', 'd:/test_ocr/3\091.png', 'd:/test_ocr/3\092.png', 'd:/te t_ocr/3\093.png', 'd:/test_ocr/3\094.png', 'd:/test_ocr/3\095.png', 'd:/test_ocr/3\096.png', 'd:/test_ocr/3\097.png', 'd:/test_ocr/3\098.png', d:/test_ocr/3\099.png', 'd:/test_ocr/3\100.png'] libpng warning: iCCP: known incorrect sRGB profile libpng warning: iCCP: known incorrect sRGB profile libpng warning: iCCP: known incorrect sRGB profile libpng warning: iCCP: known incorrect sRGB profile libpng warning: iCCP: known incorrect sRGB profile libpng warning: iCCP: known incorrect sRGB profile libpng warning: iCCP: known incorrect sRGB profile libpng warning: iCCP: known incorrect sRGB profile 375.7554919719696
pytesseract 对同样的100个png文件 20个并行 27秒,CPU 92%
改成100个文件,4个列表,cpu24个核心全100%,为什么?
单个列表,不用并行池,也是cpu24个核心全90%,为什么?
改成100个文件,4个列表,cpu24个核心全100%,为什么?
单个列表,不用并行池,也是cpu24个核心全90%,为什么?
hello,我也是这个问题,你是怎么解决的呢
以下代码可以执行,但是很慢