提交 37b0eea6 作者: 吕轶伟

lyw:重要故障excel文件处理

父级 22229d3a
No preview for this file type
import json
import requests
from io import BytesIO
from PIL import Image
import base64
def image_to_base64(image):
    """Serialize an image to JPEG in memory and return it base64-encoded.

    Args:
        image: Object exposing ``save(fp, format=...)``; the file imports
            PIL, so this is expected to be a PIL image.

    Returns:
        ASCII ``str`` containing the base64 encoding of the JPEG bytes.
    """
    with BytesIO() as stream:
        # Write the encoded picture into an in-memory buffer instead of a file.
        image.save(stream, format="JPEG")
        jpeg_bytes = stream.getvalue()
    return base64.b64encode(jpeg_bytes).decode("ascii")
def base64_to_image(base64_str):
    """Decode a base64 string and open the resulting bytes as a PIL image.

    Args:
        base64_str: Base64-encoded image file contents.

    Returns:
        The object returned by ``Image.open`` on the decoded bytes.
    """
    raw_bytes = base64.b64decode(base64_str)
    # Wrap the bytes in a file-like object so PIL can read them.
    return Image.open(BytesIO(raw_bytes))
def claude3_respond(prompt):
    """Send one user message to the Claude 3.5 Sonnet model and return the reply.

    The bearer token is read from the ``CLAUDE3_API_TOKEN`` environment
    variable.  It used to be hard-coded here, which exposes the credential to
    everyone who can read the repository; the previously committed key should
    be rotated.

    Args:
        prompt: Value sent as the ``content`` of the single ``user`` message.

    Returns:
        ``choices[0].message.content`` from the JSON response, or ``None`` when
        the HTTP status is not 200 or the body does not have that shape.

    Raises:
        RuntimeError: If ``CLAUDE3_API_TOKEN`` is not set.
        requests.RequestException: On connection failure or timeout.
    """
    import os  # local import keeps this block independent of the file header

    token = os.environ.get("CLAUDE3_API_TOKEN")
    if not token:
        raise RuntimeError("CLAUDE3_API_TOKEN environment variable is not set")

    url = "https://bedrock.chatbot.cn/llm/sse-invoke"
    payload = {
        "model": "anthropic.claude-3-5-sonnet-20240620-v1:0",
        "stream": False,
        "messages": [{"role": "user", "content": prompt}]
    }
    res = requests.post(
        url=url,
        data=json.dumps(payload),
        headers={
            "Authorization": "Bearer " + token,
            "Content-Type": "application/json",
        },
        # (connect, read) seconds: without a timeout a stalled server blocks forever.
        timeout=(10, 300),
    )
    if res.status_code != 200:
        print(res)
        return None
    try:
        return res.json()['choices'][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # ValueError covers a non-JSON body; the others cover an unexpected shape.
        print(f"claude3_respond: unexpected response body: {e!r}")
        return None
def claude37_img_respond(img_path, prompt):
    """Send an image file plus a text prompt to Claude 3.7 Sonnet and return the reply.

    The bearer token is read from the ``CLAUDE37_API_TOKEN`` environment
    variable instead of being hard-coded; the previously committed key should
    be rotated.

    Args:
        img_path: Path of a local image file; its bytes are sent base64-encoded.
        prompt: Text sent alongside the image in the same user message.

    Returns:
        ``choices[0].message.content`` from the JSON response, or ``""`` when
        the HTTP status is not 200 or the body does not have that shape.

    Raises:
        RuntimeError: If ``CLAUDE37_API_TOKEN`` is not set.
        OSError: If ``img_path`` cannot be read.
        requests.RequestException: On connection failure or timeout.
    """
    import mimetypes
    import os

    token = os.environ.get("CLAUDE37_API_TOKEN")
    if not token:
        raise RuntimeError("CLAUDE37_API_TOKEN environment variable is not set")

    url = "https://bedrock.chatbot.cn/llm/sse-invoke"
    with open(img_path, "rb") as f:
        base64_str = base64.b64encode(f.read()).decode("ascii")

    # Declare the real image type when the extension reveals it; fall back to
    # the previous fixed value so extension-less files behave as before.
    media_type = mimetypes.guess_type(img_path)[0]
    if not media_type or not media_type.startswith("image/"):
        media_type = "image/png"

    content = [
        {"source": {"type": "base64", "media_type": media_type, "data": base64_str}, "type": "image"},
        {"type": "text", "text": prompt},
    ]
    payload = {
        "model": "arn:aws:bedrock:us-east-1:730335234231:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
        "stream": False,
        "messages": [{"role": "user", "content": content}]
    }
    res = requests.post(
        url=url,
        data=json.dumps(payload),
        headers={
            "Authorization": "Bearer " + token,
            "Content-Type": "application/json",
        },
        # (connect, read) seconds: without a timeout a stalled server blocks forever.
        timeout=(10, 300),
    )
    if res.status_code != 200:
        print(res)
        return ""
    try:
        return res.json()['choices'][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # ValueError covers a non-JSON body; the others cover an unexpected shape.
        print(f"claude37_img_respond: unexpected response body: {e!r}")
        return ""
def claude37_respond(prompt):
    """Send one user message to the Claude 3.7 Sonnet model and return the reply.

    The bearer token is read from the ``CLAUDE37_API_TOKEN`` environment
    variable instead of being hard-coded; the previously committed key should
    be rotated.

    Args:
        prompt: Value sent as the ``content`` of the single ``user`` message.

    Returns:
        ``choices[0].message.content`` from the JSON response, or ``None`` when
        the HTTP status is not 200 or the body does not have that shape.

    Raises:
        RuntimeError: If ``CLAUDE37_API_TOKEN`` is not set.
        requests.RequestException: On connection failure or timeout.
    """
    import os  # local import keeps this block independent of the file header

    token = os.environ.get("CLAUDE37_API_TOKEN")
    if not token:
        raise RuntimeError("CLAUDE37_API_TOKEN environment variable is not set")

    url = "https://bedrock.chatbot.cn/llm/sse-invoke"
    payload = {
        "model": "arn:aws:bedrock:us-east-1:730335234231:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
        "stream": False,
        "messages": [{"role": "user", "content": prompt}]
    }
    res = requests.post(
        url=url,
        data=json.dumps(payload),
        headers={
            "Authorization": "Bearer " + token,
            "Content-Type": "application/json",
        },
        # (connect, read) seconds: without a timeout a stalled server blocks forever.
        timeout=(10, 300),
    )
    if res.status_code != 200:
        # Print the response object; the old code printed the still-None result.
        print(res)
        return None
    try:
        return res.json()['choices'][0]["message"]["content"]
    except (ValueError, KeyError, IndexError, TypeError) as e:
        # ValueError covers a non-JSON body; the others cover an unexpected shape.
        print(f"claude37_respond: unexpected response body: {e!r}")
        return None
def claude37_respond_stream(prompt):
    """Stream a Claude 3.7 Sonnet reply, yielding text fragments as they arrive.

    The response body is read as server-sent events separated by a blank line.
    Every event that starts with ``data: `` is parsed as JSON and the non-empty
    ``choices[0].delta.content`` value is yielded.  Events that are not valid
    JSON are skipped.

    The bearer token is read from the ``CLAUDE37_API_TOKEN`` environment
    variable instead of being hard-coded; the previously committed key should
    be rotated.

    Args:
        prompt: Value sent as the ``content`` of the single ``user`` message.

    Yields:
        Text fragments of the reply, or a single ``"Error: <status>"`` string
        when the HTTP status is not 200.

    Raises:
        RuntimeError: If ``CLAUDE37_API_TOKEN`` is not set.  Because this is a
            generator, it is raised on the first iteration, not at call time.
        requests.RequestException: On connection failure or timeout.
    """
    import codecs
    import os

    token = os.environ.get("CLAUDE37_API_TOKEN")
    if not token:
        raise RuntimeError("CLAUDE37_API_TOKEN environment variable is not set")

    url = "https://bedrock.chatbot.cn/llm/sse-invoke"
    payload = {
        "model": "arn:aws:bedrock:us-east-1:730335234231:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
        "stream": True,
        "messages": [{"role": "user", "content": prompt}]
    }
    with requests.post(
        url=url,
        data=json.dumps(payload),
        headers={
            "Authorization": "Bearer " + token,
            "Content-Type": "application/json",
        },
        # Without stream=True requests downloads the whole body before
        # iter_content yields anything, so nothing would actually stream.
        stream=True,
        timeout=(10, 300),
    ) as response:
        if response.status_code != 200:
            yield f"Error: {response.status_code}"
            return

        # An incremental decoder keeps a multi-byte UTF-8 character intact when
        # it is split across two network chunks.
        decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
        buffer = ""
        for chunk in response.iter_content(chunk_size=1024):
            if not chunk:
                continue
            buffer += decoder.decode(chunk)
            # Handle every complete event currently held in the buffer.
            while '\n\n' in buffer:
                event, buffer = buffer.split('\n\n', 1)
                if not event.startswith('data: '):
                    continue
                try:
                    parsed = json.loads(event[6:])  # drop the 'data: ' prefix
                except json.JSONDecodeError:
                    # Non-JSON payloads are ignored.
                    continue
                if parsed.get("choices") and len(parsed["choices"]) > 0:
                    delta = parsed["choices"][0].get("delta", {})
                    if "content" in delta and delta["content"]:
                        yield delta["content"]
import json
from time import sleep
import requests
def process(file_path, poll_interval=2, max_wait=None):
    """Upload a document to the doc2md service and poll until the task finishes.

    Args:
        file_path: Path of the file to upload.
        poll_interval: Seconds to sleep before each status request.
        max_wait: Optional limit, in seconds, on the total polling time.
            ``None`` (the default) polls until the task reports a final status.

    Returns:
        The decoded JSON of the last status response, whose
        ``data.status`` is ``"DONE"`` or ``"FAIL"``.

    Raises:
        TimeoutError: If ``max_wait`` is given and elapses first.
        KeyError: If a response lacks the expected ``data`` fields.
    """
    from time import monotonic

    url = "http://work.chatbot.cn:38890/chatbot/v2/layout/doc2md"
    status_url = "http://work.chatbot.cn:38890/chatbot/taskstatus"

    # Close the upload handle as soon as the request has been sent.
    with open(file_path, "rb") as upload:
        response = requests.request("POST", url, files={"file": upload})
    taskid = json.loads(response.text)['data']['taskid']

    body = json.dumps({"taskid": taskid})
    headers = {"Content-Type": "application/json"}
    deadline = None if max_wait is None else monotonic() + max_wait

    sleep(1)
    while True:
        sleep(poll_interval)
        response = requests.request("POST", status_url, data=body, headers=headers)
        retJson = json.loads(response.text)
        if retJson['data']['status'] in ("DONE", "FAIL"):
            return retJson
        if deadline is not None and monotonic() >= deadline:
            raise TimeoutError(f"task {taskid} did not finish within {max_wait} seconds")
\ No newline at end of file
import re
import os
def extract_filename(filepath):
    """Return the last path component of *filepath* without its final extension.

    Both ``/`` and ``\\`` are treated as separators.  Only the text from the
    last ``.`` onward is removed, so ``"a.tar.gz"`` becomes ``"a.tar"``.

    Args:
        filepath: Path string.

    Returns:
        The base name without its extension, or ``""`` when *filepath* is empty
        or ends with a separator (there is no file name to extract).
    """
    # Everything after the last slash or backslash.
    match = re.search(r'[^\\/]+$', filepath)
    if match is None:
        return ""
    # Strip the trailing ".ext" if there is one.
    return re.sub(r'\.[^.]*$', '', match.group())
\ No newline at end of file
import threading
import time
from queue import Queue
from cn.chatbot.module.ClaudeAPI import claude37_img_respond
import cn.chatbot.test.ocr_img as OCR
class BatchOcr():
    """Run ``OCR.extract_img_data`` over a collection of image paths using threads.

    Completion is detected by joining the worker threads.  The previous
    semaphore bookkeeping is gone: it made ``run()`` spin forever when a
    worker died and deadlocked on a second ``run()`` call.
    """

    def __init__(self, data, thread_num=4):
        """Store the work list and the number of worker threads.

        Args:
            data: Iterable of image paths to process.
            thread_num: Number of worker threads started by ``run()``.
        """
        self.dataset = data
        self.thread_num = thread_num

    def worker(self, in_queue: Queue, out_queue: Queue):
        """Drain *in_queue*, putting one ``{"img_path", "res"}`` dict per success on *out_queue*.

        An image whose OCR call raises is reported and skipped, so one bad
        image does not kill the thread or lose the remaining work.
        """
        from queue import Empty

        while True:
            # get_nowait avoids the race between empty() and get() when
            # several workers compete for the last item.
            try:
                img_path = in_queue.get_nowait()
            except Empty:
                return
            try:
                res = OCR.extract_img_data(img_path)
            except Exception as e:  # thread boundary: report and keep going
                print("ocr failed ->" + str(img_path) + " " + repr(e))
                continue
            out_queue.put({"img_path": img_path, "res": res})

    def count_worker(self, in_queue: Queue, out_queue: Queue):
        """Print the number of finished items about once a second while input remains."""
        processed = 0
        while not in_queue.empty():
            if (out_queue.qsize() > processed):
                processed = out_queue.qsize()
                print("processed ->" + str(processed))
            time.sleep(1)

    def run(self):
        """Process every item of the dataset.

        Returns:
            Dict mapping each successfully processed image path to its OCR result.
        """
        in_queue = Queue()
        for example in self.dataset:
            in_queue.put(example)
        print('load in_queue_size', in_queue.qsize())
        out_queue = Queue()

        # Worker threads plus one progress reporter.
        t_list = [
            threading.Thread(target=self.worker, args=(in_queue, out_queue), daemon=True)
            for _ in range(self.thread_num)
        ]
        t_list.append(
            threading.Thread(target=self.count_worker, args=(in_queue, out_queue), daemon=True)
        )
        for t in t_list:
            t.start()
        # Once every thread has been joined, all results are in out_queue.
        for t in t_list:
            t.join()

        ret = dict()
        while not out_queue.empty():
            rr = out_queue.get()
            ret[rr["img_path"]] = rr["res"]
        return ret
import re
from cn.chatbot.module.TextInClient import TextInClient
import cn.chatbot.test.ocr_img as OCR
from cn.chatbot.test.BatchOcr import BatchOcr
def process_img_and_ocr(md:str):
    """Append OCR output after every Markdown image tag found in *md*.

    Each distinct image path is passed to ``BatchOcr`` once.  For every path
    with a non-empty string result, all occurrences of its ``![alt](path)``
    tag are replaced by the tag followed by a blank line and the OCR text.

    Args:
        md: Markdown text.

    Returns:
        The Markdown with OCR text inserted; *md* unchanged when it contains
        no image tags.
    """
    # Group 1 is the whole image tag, group 2 the path inside the parentheses.
    pattern = r'(!\[.*?\]\((.*?)\))'
    matches = re.findall(pattern, md)
    print("total imgs:" + str(len(matches)))
    if not matches:
        # Nothing to OCR: skip starting the worker threads.
        return md

    # Path -> tag.  A dict removes duplicate paths so each image is OCR'd once;
    # when a path appears under several tags the last one wins, as before.
    tag_by_path = dict()
    for original_tag, img_path in matches:
        tag_by_path[img_path] = original_tag

    batchOcr = BatchOcr(list(tag_by_path))
    res = batchOcr.run()
    for img_path, ocr_text in res.items():
        original_tag = tag_by_path.get(img_path)
        if original_tag is None or not isinstance(ocr_text, str) or not ocr_text:
            # Missing or failed OCR result: leave this tag untouched.
            print("replace err")
            continue
        md = md.replace(original_tag, original_tag + "\n\n" + ocr_text)
    return md
# Module-level TextInClient instance; its process_pdf() is called for every PDF in the batch loop below.
textInClient = TextInClient()
import os
def read_path_file(path):
    """Walk *path* recursively and collect every file path ending in ``.pdf``.

    Args:
        path: Root directory to search.

    Returns:
        List of joined ``dirpath/filename`` strings, in ``os.walk`` order.
    """
    return [
        candidate
        for folder, _subdirs, names in os.walk(path)
        for candidate in (os.path.join(folder, name) for name in names)
        if candidate.endswith(".pdf")
    ]
# Batch job: convert every PDF under in_path to Markdown, add OCR output for
# the images it references, and write one .md file per PDF into out_path.
in_path = "D:\\works\\layout\\data\\HK"
out_path = "D:\\works\\layout\\data\\HK_MD"
# Create the output directory up front so the first write cannot fail on a
# machine where it does not exist yet.
os.makedirs(out_path, exist_ok=True)
file_list = read_path_file(in_path)
for file in file_list:
    print("processing ---> " + file)
    md = textInClient.process_pdf(file)
    md = process_img_and_ocr(md)
    # Swap only the trailing extension; str.replace would also rewrite a
    # ".pdf" that occurs earlier in the file name.
    filename = os.path.splitext(os.path.basename(file))[0] + ".md"
    with open(os.path.join(out_path, filename), "w", encoding="utf-8") as f:
        f.write(md)
import os
from urllib.request import urlretrieve
from cn.chatbot.module.ClaudeAPI import claude37_img_respond
def extract_img_data(img_path):
    """Download an image and ask Claude to transcribe its data as a Markdown table.

    Args:
        img_path: URL of the image; it is fetched with ``urlretrieve`` into
            the ``./tmp/`` directory.

    Returns:
        Whatever ``claude37_img_respond`` returns for the downloaded file and
        the fixed extraction prompt.
    """
    from urllib.parse import urlsplit

    tmp_dir = './tmp/'
    # urlretrieve does not create missing directories.
    os.makedirs(tmp_dir, exist_ok=True)
    # Name the local file after the URL path only, so a query string or
    # fragment does not end up in the file name.
    file_name = os.path.basename(urlsplit(img_path).path) or os.path.basename(img_path)
    local_file_path = tmp_dir + file_name
    # Download and save the image.
    urlretrieve(img_path, local_file_path)
    prompt = ("你是一个图片识别机器人,基于我提供给你的图片,识别里面的数据,用表格数据,用markdown格式输出,无需其它解释.\n"
              "\n要求:\n"
              "(1)若图片中有图表,则用表格恢复完整的数据\n"
              "(2)请恢复完整,全面的数据\n"
              "(3)请你直接给出抽取到的结果即可,无需解释,无需解释说,你要开始抽取之类的,直接输出表格"
              "(4)输出开始示例:|表头1|表头2|\n|----|----|\n|数据1|数据2|")
    res = claude37_img_respond(local_file_path, prompt)
    return res
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 后发表评论