lyw:重要故障excel文件处理

37b0eea6 · 吕轶伟 · 22229d3a · 37b0eea6 · 37b0eea6 · 37b0eea6
--- a/requirements.txt
+++ b/requirements.txt
--- a/src/excel/FaultReport.py
+++ b/src/excel/FaultReport.py
--- a/src/tools/ClaudeAPI.py
+++ b/src/tools/ClaudeAPI.py
+import json
+import requests
+from io import BytesIO
+from PIL import Image
+import base64
+def image_to_base64(image):
+    """Encode a PIL image as a base64 string of its JPEG bytes.
+
+    Args:
+        image: A PIL ``Image`` instance.
+
+    Returns:
+        An ASCII ``str`` holding the base64 encoding of the JPEG data.
+    """
+    # JPEG cannot store alpha or palette data; Pillow raises OSError for such
+    # modes (RGBA, P, LA, ...), so convert anything it cannot write to RGB.
+    if image.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
+        image = image.convert("RGB")
+    buffer = BytesIO()  # in-memory sink for the encoded bytes
+    image.save(buffer, format="JPEG")
+    return base64.b64encode(buffer.getvalue()).decode("ascii")
+def base64_to_image(base64_str):
+    """Decode a base64 string into a PIL image.
+
+    Args:
+        base64_str: Base64-encoded bytes of an image file.
+
+    Returns:
+        The image object returned by ``Image.open`` on the decoded bytes.
+    """
+    raw_bytes = base64.b64decode(base64_str)
+    return Image.open(BytesIO(raw_bytes))
+def claude3_respond(prompt, timeout=300):
+    """Send a single-turn prompt to the Claude 3.5 Sonnet endpoint.
+
+    The bearer token is read from the ``CLAUDE3_API_TOKEN`` environment
+    variable. It must never be committed to source control; any token that
+    was previously committed should be revoked and re-issued.
+
+    Args:
+        prompt: Content of the single user message.
+        timeout: Seconds to wait for the HTTP request before giving up.
+
+    Returns:
+        The reply text, or ``None`` if the service answered with a non-200
+        status or a body that does not have the expected shape.
+
+    Raises:
+        RuntimeError: If ``CLAUDE3_API_TOKEN`` is not set.
+    """
+    import logging
+    import os
+    log = logging.getLogger(__name__)
+    token = os.environ.get("CLAUDE3_API_TOKEN")
+    if not token:
+        raise RuntimeError("CLAUDE3_API_TOKEN environment variable is not set")
+    url = "https://bedrock.chatbot.cn/llm/sse-invoke"
+    payload = {
+        "model": "anthropic.claude-3-5-sonnet-20240620-v1:0",
+        "stream": False,
+        "messages": [{"role": "user", "content": prompt}]
+    }
+    res = requests.post(url=url, data=json.dumps(payload), headers={
+        "Authorization": "Bearer " + token,
+        "Content-Type": "application/json"}, timeout=timeout)
+    if res.status_code != 200:
+        log.error("claude3_respond: HTTP %s: %s", res.status_code, res.text[:500])
+        return None
+    try:
+        return res.json()['choices'][0]["message"]["content"]
+    except (ValueError, KeyError, IndexError, TypeError):
+        # ValueError covers a body that is not valid JSON.
+        log.exception("claude3_respond: unexpected response body")
+        return None
+def claude37_img_respond(img_path, prompt, timeout=300):
+    """Send a local image plus a text prompt to the Claude 3.7 Sonnet endpoint.
+
+    The bearer token is read from the ``CLAUDE37_API_TOKEN`` environment
+    variable. It must never be committed to source control; any token that
+    was previously committed should be revoked and re-issued.
+
+    Args:
+        img_path: Path of the image file to upload.
+        prompt: Text sent alongside the image.
+        timeout: Seconds to wait for the HTTP request before giving up.
+
+    Returns:
+        The reply text, or ``""`` if the service answered with a non-200
+        status or a body that does not have the expected shape.
+
+    Raises:
+        RuntimeError: If ``CLAUDE37_API_TOKEN`` is not set.
+    """
+    import logging
+    import mimetypes
+    import os
+    log = logging.getLogger(__name__)
+    token = os.environ.get("CLAUDE37_API_TOKEN")
+    if not token:
+        raise RuntimeError("CLAUDE37_API_TOKEN environment variable is not set")
+    url = "https://bedrock.chatbot.cn/llm/sse-invoke"
+    with open(img_path, "rb") as f:
+        byte_data = f.read()
+    base64_str = base64.b64encode(byte_data).decode("ascii")
+    # Declare the real media type instead of always claiming PNG; fall back
+    # to PNG when the extension is unknown or not an image type.
+    media_type = mimetypes.guess_type(img_path)[0]
+    if not media_type or not media_type.startswith("image/"):
+        media_type = "image/png"
+    content = [
+        {"source": {"type": "base64", "media_type": media_type, "data": base64_str}, "type": "image"},
+        {"type": "text", "text": prompt},
+    ]
+    payload = {
+        "model": "arn:aws:bedrock:us-east-1:730335234231:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
+        "stream": False,
+        "messages": [{"role": "user", "content": content}]
+    }
+    res = requests.post(url=url, data=json.dumps(payload), headers={
+        "Authorization": "Bearer " + token,
+        "Content-Type": "application/json"}, timeout=timeout)
+    if res.status_code != 200:
+        log.error("claude37_img_respond: HTTP %s: %s", res.status_code, res.text[:500])
+        return ""
+    try:
+        return res.json()['choices'][0]["message"]["content"]
+    except (ValueError, KeyError, IndexError, TypeError):
+        # ValueError covers a body that is not valid JSON.
+        log.exception("claude37_img_respond: unexpected response body")
+        return ""
+def claude37_respond(prompt, timeout=300):
+    """Send a single-turn prompt to the Claude 3.7 Sonnet endpoint.
+
+    The bearer token is read from the ``CLAUDE37_API_TOKEN`` environment
+    variable. It must never be committed to source control; any token that
+    was previously committed should be revoked and re-issued.
+
+    Args:
+        prompt: Content of the single user message.
+        timeout: Seconds to wait for the HTTP request before giving up.
+
+    Returns:
+        The reply text, or ``None`` if the service answered with a non-200
+        status or a body that does not have the expected shape.
+
+    Raises:
+        RuntimeError: If ``CLAUDE37_API_TOKEN`` is not set.
+    """
+    import logging
+    import os
+    log = logging.getLogger(__name__)
+    token = os.environ.get("CLAUDE37_API_TOKEN")
+    if not token:
+        raise RuntimeError("CLAUDE37_API_TOKEN environment variable is not set")
+    url = "https://bedrock.chatbot.cn/llm/sse-invoke"
+    payload = {
+        "model": "arn:aws:bedrock:us-east-1:730335234231:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
+        "stream": False,
+        "messages": [{"role": "user", "content": prompt}]
+    }
+    res = requests.post(url=url, data=json.dumps(payload), headers={
+        "Authorization": "Bearer " + token,
+        "Content-Type": "application/json"}, timeout=timeout)
+    if res.status_code != 200:
+        # Report the response itself; the original printed `ret`, which is
+        # always None on this branch.
+        log.error("claude37_respond: HTTP %s: %s", res.status_code, res.text[:500])
+        return None
+    try:
+        return res.json()['choices'][0]["message"]["content"]
+    except (ValueError, KeyError, IndexError, TypeError):
+        # ValueError covers a body that is not valid JSON.
+        log.exception("claude37_respond: unexpected response body")
+        return None
+def claude37_respond_stream(prompt, timeout=300):
+    """Stream the Claude 3.7 Sonnet reply to a single-turn prompt.
+
+    The bearer token is read from the ``CLAUDE37_API_TOKEN`` environment
+    variable. It must never be committed to source control; any token that
+    was previously committed should be revoked and re-issued.
+
+    Args:
+        prompt: Content of the single user message.
+        timeout: Seconds to wait for the connection and for each read.
+
+    Yields:
+        Each non-empty ``choices[0].delta.content`` fragment in arrival
+        order. On a non-200 status a single ``"Error: <status>"`` string is
+        yielded and the generator ends.
+
+    Raises:
+        RuntimeError: If ``CLAUDE37_API_TOKEN`` is not set (raised when the
+            generator is first advanced).
+    """
+    import codecs
+    import os
+    token = os.environ.get("CLAUDE37_API_TOKEN")
+    if not token:
+        raise RuntimeError("CLAUDE37_API_TOKEN environment variable is not set")
+    url = "https://bedrock.chatbot.cn/llm/sse-invoke"
+    payload = {
+        "model": "arn:aws:bedrock:us-east-1:730335234231:inference-profile/us.anthropic.claude-3-7-sonnet-20250219-v1:0",
+        "stream": True,
+        "messages": [{"role": "user", "content": prompt}]
+    }
+    headers = {
+        "Authorization": "Bearer " + token,
+        "Content-Type": "application/json"}
+    # stream=True is required: without it requests downloads the whole body
+    # before iter_content yields anything, so nothing is actually streamed.
+    with requests.post(url=url, data=json.dumps(payload), headers=headers,
+                       stream=True, timeout=timeout) as response:
+        if response.status_code != 200:
+            yield f"Error: {response.status_code}"
+            return
+        # An incremental decoder keeps multi-byte UTF-8 characters intact
+        # when they are split across two network chunks.
+        decoder = codecs.getincrementaldecoder("utf-8")(errors="replace")
+        buffer = ""
+        for chunk in response.iter_content(chunk_size=1024):
+            if not chunk:
+                continue
+            buffer += decoder.decode(chunk)
+            # SSE events are separated by a blank line.
+            while '\n\n' in buffer:
+                event, buffer = buffer.split('\n\n', 1)
+                if not event.startswith('data: '):
+                    continue
+                try:
+                    parsed = json.loads(event[6:])  # drop the 'data: ' prefix
+                except json.JSONDecodeError:
+                    # Non-JSON payloads are skipped.
+                    continue
+                if not isinstance(parsed, dict):
+                    continue
+                choices = parsed.get("choices")
+                if choices:
+                    delta = choices[0].get("delta", {})
+                    if delta.get("content"):
+                        yield delta["content"]
--- a/src/tools/LayoutClient.py
+++ b/src/tools/LayoutClient.py
+import json
+from time import sleep
+import requests
+def process(file_path):
+    """Upload a document to the layout service and poll until the task ends.
+
+    Args:
+        file_path: Path of the file to upload.
+
+    Returns:
+        The parsed JSON of the last status response, i.e. the first one whose
+        ``data.status`` is ``"DONE"`` or ``"FAIL"``.
+
+    Note:
+        Polling has no upper bound; it continues until the service reports
+        one of the two terminal statuses.
+    """
+    url = "http://work.chatbot.cn:38890/chatbot/v2/layout/doc2md"
+    status_url = "http://work.chatbot.cn:38890/chatbot/taskstatus"
+    headers = {"Content-Type": "application/json"}
+    # Close the upload handle deterministically instead of leaking it.
+    with open(file_path, "rb") as upload:
+        response = requests.request("POST", url, files={"file": upload})
+    taskid = json.loads(response.text)['data']['taskid']
+    body = json.dumps({"taskid": taskid})
+    sleep(1)
+    while True:
+        response = requests.request("POST", status_url, data=body, headers=headers)
+        retJson = json.loads(response.text)
+        # Honour the very first poll too; the original discarded it.
+        if retJson['data']['status'] in ("DONE", "FAIL"):
+            return retJson
+        sleep(2)
\ No newline at end of file
--- a/src/tools/fileUtil.py
+++ b/src/tools/fileUtil.py
+import re
+import os
+def extract_filename(filepath):
+    """Return the last component of *filepath* without its final extension.
+
+    Both ``/`` and ``\\`` are treated as separators regardless of platform.
+    An empty path, or one that ends in a separator, yields ``""`` instead of
+    raising ``AttributeError``.
+
+    Args:
+        filepath: Path string using forward and/or back slashes.
+
+    Returns:
+        The file name with everything from its last ``.`` onward removed.
+    """
+    # Text after the last separator (the whole string if there is none).
+    basename = re.split(r'[\\/]', filepath)[-1]
+    # Strip only the final ".ext" suffix.
+    return re.sub(r'\.[^.]*$', '', basename)
\ No newline at end of file
--- a/src/tools/ocrImage/BatchOcr.py
+++ b/src/tools/ocrImage/BatchOcr.py
+import threading
+import time
+from queue import Queue
+from cn.chatbot.module.ClaudeAPI import claude37_img_respond
+import cn.chatbot.test.ocr_img as OCR
+class BatchOcr():
+    """Run ``OCR.extract_img_data`` over a list of image paths using threads."""
+    def __init__(self, data, thread_num=4):
+        """Store the image paths and the number of worker threads to start.
+
+        Args:
+            data: Iterable of image paths/URLs to process.
+            thread_num: Number of worker threads.
+        """
+        self.dataset = data
+        self.thread_num = thread_num
+        # Kept only so existing attribute access still works; completion is
+        # tracked by joining the threads, not by this semaphore.
+        self.seamp = threading.BoundedSemaphore(self.thread_num)
+    def worker(self, in_queue: Queue, out_queue: Queue):
+        """Drain *in_queue*, OCR each image, and push results to *out_queue*.
+
+        Each result is a dict with keys ``"img_path"`` and ``"res"``. An
+        image whose OCR call raises is reported and skipped so one failure
+        does not stop the rest of the batch.
+        """
+        from queue import Empty
+        while True:
+            try:
+                # Non-blocking get avoids the empty()/get() race in which
+                # another worker takes the last item in between.
+                img_path = in_queue.get_nowait()
+            except Empty:
+                return
+            try:
+                res = OCR.extract_img_data(img_path)
+            except Exception as exc:
+                print("ocr failed ->" + str(img_path) + " : " + str(exc))
+                continue
+            out_queue.put({"img_path": img_path, "res": res})
+    def count_worker(self, in_queue: Queue, out_queue: Queue):
+        """Print the number of finished items once a second while input remains."""
+        processed = 0
+        while not in_queue.empty():
+            if (out_queue.qsize() > processed):
+                processed = out_queue.qsize()
+                print("processed ->" + str(processed))
+            time.sleep(1)
+    def run(self):
+        """Process every item in ``self.dataset``.
+
+        Returns:
+            Dict mapping each successfully processed image path to its OCR
+            result.
+        """
+        in_queue = Queue()
+        for example in self.dataset:
+            in_queue.put(example)
+        print('load in_queue_size', in_queue.qsize())
+        out_queue = Queue()
+        # Start the workers; at least one, so the input queue always drains.
+        t_list = []
+        for ind in range(max(1, self.thread_num)):
+            t = threading.Thread(target=self.worker, args=(in_queue, out_queue), daemon=True)
+            t_list.append(t)
+            t.start()
+        t = threading.Thread(target=self.count_worker, args=(in_queue, out_queue), daemon=True)
+        t_list.append(t)
+        t.start()
+        # join() alone guarantees every worker has finished, so no extra
+        # wait on the semaphore's private counter is needed.
+        for t in t_list:
+            t.join()
+        ret = dict()
+        while not out_queue.empty():
+            rr = out_queue.get()
+            ret[rr["img_path"]] = rr["res"]
+        return ret
--- a/src/tools/ocrImage/layoutAndOCR.py
+++ b/src/tools/ocrImage/layoutAndOCR.py
+import re
+from cn.chatbot.module.TextInClient import TextInClient
+import cn.chatbot.test.ocr_img as OCR
+from cn.chatbot.test.BatchOcr import BatchOcr
+def process_img_and_ocr(md:str):
+    """Append OCR output after every markdown image tag in *md*.
+
+    Finds each ``![alt](url)`` tag, runs ``BatchOcr`` once per distinct URL,
+    and inserts the OCR text two newlines after every tag that references
+    that URL.
+
+    Args:
+        md: Markdown text.
+
+    Returns:
+        The markdown with OCR text inserted. Tags whose OCR result is
+        missing or not a string are left unchanged.
+    """
+    # Group 1 is the full image tag, group 2 the URL inside the parentheses.
+    pattern = r'(!\[.*?\]\((.*?)\))'
+    matches = re.findall(pattern, md)
+    print("total imgs：" + str(len(matches)))
+    # URL -> distinct tags using it. De-duplicating means each image is
+    # OCR'd once and no tag sharing a URL is overlooked.
+    tags_by_url = dict()
+    for tag, url in matches:
+        tags = tags_by_url.setdefault(url, [])
+        if tag not in tags:
+            tags.append(tag)
+    batchOcr = BatchOcr(list(tags_by_url))
+    res = batchOcr.run()
+    for url, text in res.items():
+        if not isinstance(text, str):
+            # e.g. None from a failed request; nothing to append.
+            print("replace err")
+            continue
+        for tag in tags_by_url.get(url, []):
+            md = md.replace(tag, tag + "\n\n" + text)
+    return md
+# Module-level TextInClient instance; the batch loop at the bottom of this
+# file calls its process_pdf() for each PDF.
+textInClient = TextInClient()
+import os
+def read_path_file(path):
+    """Recursively collect every file under *path* whose path ends in ".pdf".
+
+    Args:
+        path: Root directory to walk.
+
+    Returns:
+        List of full file paths, in ``os.walk`` order.
+    """
+    pdf_paths = []
+    for root, _subdirs, names in os.walk(path):
+        candidates = (os.path.join(root, name) for name in names)
+        pdf_paths.extend(p for p in candidates if p.endswith(".pdf"))
+    return pdf_paths
+# Batch job: convert every PDF under in_path to markdown, enrich the image
+# tags with OCR text, and write one .md file per PDF into out_path.
+in_path = "D:\\works\\layout\\data\\HK"
+out_path = "D:\\works\\layout\\data\\HK_MD"
+# Create the destination up front so the first write cannot fail on a
+# missing directory.
+os.makedirs(out_path, exist_ok=True)
+file_list = read_path_file(in_path)
+for file in file_list:
+    print("processing ---> " + file)
+    md = textInClient.process_pdf(file)
+    md = process_img_and_ocr(md)
+    # Swap only the trailing extension; str.replace would also rewrite any
+    # ".pdf" occurring earlier in the name.
+    # NOTE(review): PDFs with the same base name in different sub-folders
+    # still overwrite each other here.
+    filename = os.path.splitext(os.path.basename(file))[0] + ".md"
+    with open(os.path.join(out_path, filename), "w", encoding="utf-8") as f:
+        f.write(md)
--- a/src/tools/ocrImage/ocr_img.py
+++ b/src/tools/ocrImage/ocr_img.py
+import os
+from urllib.request import urlretrieve
+from cn.chatbot.module.ClaudeAPI import claude37_img_respond
+def  extract_img_data(img_path):
+    """Download an image and ask Claude to transcribe its data as a table.
+
+    The image is fetched into a uniquely named file under ``./tmp`` (created
+    if missing), so concurrent calls never share a local path. The file is
+    removed once the request finishes.
+
+    Args:
+        img_path: URL of the image to download.
+
+    Returns:
+        Whatever ``claude37_img_respond`` returns for the downloaded image.
+    """
+    import tempfile
+    from urllib.parse import urlparse
+    os.makedirs('./tmp', exist_ok=True)
+    # Keep the URL's extension but drop any query string or fragment.
+    suffix = os.path.splitext(urlparse(img_path).path)[1]
+    fd, local_file_path = tempfile.mkstemp(suffix=suffix, dir='./tmp')
+    os.close(fd)  # urlretrieve reopens the path itself
+    try:
+        # Download and save the image.
+        urlretrieve(img_path, local_file_path)
+        prompt = ("你是一个图片识别机器人，基于我提供给你的图片，识别里面的数据，用表格数据，用markdown格式输出，无需其它解释.\n"
+                  "\n要求：\n"
+                  "（1）若图片中有图表，则用表格恢复完整的数据\n"
+                  "（2）请恢复完整，全面的数据\n"
+                  "（3）请你直接给出抽取到的结果即可，无需解释，无需解释说，你要开始抽取之类的，直接输出表格"
+                  "（4）输出开始示例：|表头1|表头2|\n|----|----|\n|数据1|数据2|")
+        return claude37_img_respond(local_file_path, prompt)
+    finally:
+        os.remove(local_file_path)
--- a/src/tools/server.py
+++ b/src/tools/server.py