diff --git a/common/data_source/confluence_connector.py b/common/data_source/confluence_connector.py index 58a7d2f82b..abe55b5b27 100644 --- a/common/data_source/confluence_connector.py +++ b/common/data_source/confluence_connector.py @@ -920,7 +920,7 @@ def extract_text_from_confluence_html( confluence_client (Confluence): Confluence client fetched_titles (set[str]): The titles of the pages that have already been fetched Returns: - str: loaded and formated Confluence page + str: loaded and formatted Confluence page """ body = confluence_object["body"] object_html = body.get("storage", body.get("view", {})).get("value") diff --git a/deepdoc/README.md b/deepdoc/README.md index db70e30d80..b8bd15e307 100644 --- a/deepdoc/README.md +++ b/deepdoc/README.md @@ -98,7 +98,7 @@ We use vision information to resolve problems as human being. ```bash python deepdoc/vision/t_recognizer.py --inputs=path_to_images_or_pdfs --threshold=0.2 --mode=tsr --output_dir=path_to_store_result ``` - The inputs could be directory to images or PDF, or a image or PDF. + The inputs could be a directory of images or PDFs, or a single image or PDF. You can look into the folder 'path_to_store_result' where has both images and html pages which demonstrate the detection results as following:
diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 6717a887ae..ad63ded035 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -708,7 +708,7 @@ class RAGFlowPdfParser: def __ocr(self, pagenum, img, chars, ZM=3, device_id: int | None = None): start = timer() bxs = self.ocr.detect(np.array(img), device_id) - logging.info(f"__ocr detecting boxes of a image cost ({timer() - start}s)") + logging.info(f"__ocr detecting boxes of an image cost ({timer() - start}s)") start = timer() if not bxs: diff --git a/deepdoc/vision/table_structure_recognizer.py b/deepdoc/vision/table_structure_recognizer.py index 0cd762576c..e0892c2d72 100644 --- a/deepdoc/vision/table_structure_recognizer.py +++ b/deepdoc/vision/table_structure_recognizer.py @@ -394,7 +394,7 @@ class TableStructureRecognizer(Recognizer): @staticmethod def __desc_table(cap, hdr_rowno, tbl, is_english): - # get text of every colomn in header row to become header text + # get text of every column in header row to become header text clmno = len(tbl[0]) rowno = len(tbl) headers = {} diff --git a/rag/prompts/assign_toc_levels.md b/rag/prompts/assign_toc_levels.md index d35dee7791..ce80c22622 100644 --- a/rag/prompts/assign_toc_levels.md +++ b/rag/prompts/assign_toc_levels.md @@ -1,4 +1,4 @@ -You are given a JSON array of TOC(tabel of content) items. Each item has at least {"title": string} and may include an existing title hierarchical level. +You are given a JSON array of TOC(table of contents) items. Each item has at least {"title": string} and may include an existing title hierarchical level. Task - For each item, assign a depth label using Arabic numerals only: top-level = 1, second-level = 2, third-level = 3, etc.