diff --git a/deepdoc/parser/markdown_parser.py b/deepdoc/parser/markdown_parser.py
index d144cb75b0..a24799e5ab 100644
--- a/deepdoc/parser/markdown_parser.py
+++ b/deepdoc/parser/markdown_parser.py
@@ -177,6 +177,80 @@ class MarkdownElementExtractor:
return ranges
+    def _table_cells(self, line):
+        """Split a pipe-delimited line into its whitespace-trimmed cells.
+
+        One leading and one trailing ``|`` border are dropped before the
+        split, so ``"| a | b |"`` and ``"a | b"`` both give ``["a", "b"]``.
+        A line that contains no ``|`` at all yields an empty list.
+        """
+        row = line.strip()
+        if "|" not in row:
+            return []
+        # ``row`` is non-empty here: it contains at least one pipe.
+        if row[0] == "|":
+            row = row[1:]
+        # Slice instead of index: removing the leading border may empty ``row``.
+        if row[-1:] == "|":
+            row = row[:-1]
+        return list(map(str.strip, row.split("|")))
+
+    def _is_table_row(self, line):
+        """Return True when ``line`` has at least two cells, one of them non-empty."""
+        cells = self._table_cells(line)
+        if len(cells) < 2:
+            return False
+        # Cells are already stripped, so truthiness means "has visible text".
+        return any(cells)
+
+    def _is_table_separator_row(self, line):
+        """Return True when ``line`` is a pipe-table delimiter row.
+
+        Every cell must consist of one or more hyphens with an optional
+        leading and/or trailing colon (``---``, ``:-``, ``-:``, ``:-:``), and
+        there must be at least two cells. GFM only requires a single hyphen
+        per cell, so short rows such as ``|-|-|`` are accepted as well.
+        Spaces inside a cell are ignored, as before.
+        """
+        cells = self._table_cells(line)
+        if len(cells) < 2:
+            return False
+        return all(re.fullmatch(r":?-+:?", cell.replace(" ", "")) for cell in cells)
+
+    def _markdown_table_ranges(self, text):
+        """Return ``(start, end)`` character offsets of pipe tables in ``text``.
+
+        A table starts at a table row immediately followed by a separator row
+        and extends over every directly following table row. ``start`` is the
+        offset of the header line; ``end`` is the end of the last row's line,
+        clamped to ``len(text)``.
+
+        NOTE(review): assumes ``self.lines`` holds the lines of ``text`` in the
+        order used by ``_line_start_offsets`` -- confirm against the caller.
+        """
+        offsets = self._line_start_offsets(text)
+        lines = self.lines
+        total = len(lines)
+
+        spans = []
+        idx = 0
+        while idx + 1 < total:
+            starts_table = self._is_table_row(lines[idx]) and self._is_table_separator_row(lines[idx + 1])
+            if not starts_table:
+                idx += 1
+                continue
+
+            # The separator row is the minimum extent; swallow body rows after it.
+            last = idx + 1
+            while last + 1 < total and self._is_table_row(lines[last + 1]):
+                last += 1
+
+            stop = min(len(text), offsets[last] + len(lines[last]))
+            spans.append((offsets[idx], stop))
+            idx = last + 1
+
+        return spans
+
+    def _html_table_ranges(self, text):
+        """Return ``(start, end)`` character offsets of HTML tables in ``text``.
+
+        Three shapes are recognised, widest first so that wrapper tags are part
+        of the span when present: ``<html><body><table>``, ``<body><table>``
+        and a bare ``<table>``. Matching is case-insensitive and spans
+        newlines. The table body is matched non-greedily, so a span ends at
+        the first ``</table>`` after its opening tag.
+        """
+        table_pattern = re.compile(
+            r"""
+            (?:
+                # <html><body><table> ... </table></body></html>
+                (?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
+                |
+                # <body><table> ... </table></body>
+                (?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
+                |
+                # bare <table> ... </table>
+                (?:<table[^>]*>.*?</table>)
+            )
+            """,
+            re.VERBOSE | re.DOTALL | re.IGNORECASE,
+        )
+        return [(match.start(), match.end()) for match in table_pattern.finditer(text)]
+
+    def _merge_ranges(self, ranges):
+        """Return ``ranges`` sorted, with overlapping or touching spans coalesced.
+
+        Each item is a ``(start, end)`` pair. A span whose start is less than
+        or equal to the previous span's end is folded into that span.
+        """
+        if not ranges:
+            return []
+
+        ordered = sorted(ranges)
+        first_start, first_end = ordered[0]
+        merged = [(first_start, first_end)]
+        for start, end in ordered[1:]:
+            prev_start, prev_end = merged[-1]
+            if start <= prev_end:
+                # Overlaps or touches the previous span: extend it in place.
+                merged[-1] = (prev_start, max(prev_end, end))
+            else:
+                merged.append((start, end))
+        return merged
+
+    def _protected_ranges(self, text):
+        """Return the merged spans of ``text`` that delimiters must not split.
+
+        Combines fenced code ranges, pipe-table ranges and HTML-table ranges
+        and coalesces them with ``_merge_ranges``.
+        """
+        fenced = self._fenced_code_ranges(text)
+        pipe_tables = self._markdown_table_ranges(text)
+        html_tables = self._html_table_ranges(text)
+        return self._merge_ranges(fenced + pipe_tables + html_tables)
+
def _append_delimited_section(self, sections, text, start, end, include_meta):
part = text[start:end]
if not part or not part.strip():
@@ -195,9 +269,9 @@ class MarkdownElementExtractor:
def _extract_delimited_elements(self, text, delimiters, include_meta=False):
sections = []
pattern = re.compile(delimiters)
- protected_ranges = self._fenced_code_ranges(text)
+ protected_ranges = self._protected_ranges(text)
if protected_ranges:
- logging.debug("markdown_parser: detected %d fenced ranges for delimiter extraction", len(protected_ranges))
+ logging.debug("markdown_parser: detected %d protected ranges for delimiter extraction", len(protected_ranges))
protected_idx = 0
last_end = 0
diff --git a/test/unit_test/deepdoc/parser/test_markdown_parser.py b/test/unit_test/deepdoc/parser/test_markdown_parser.py
index 8d51b32658..fee848be50 100644
--- a/test/unit_test/deepdoc/parser/test_markdown_parser.py
+++ b/test/unit_test/deepdoc/parser/test_markdown_parser.py
@@ -106,3 +106,41 @@ class TestMarkdownElementExtractorFences:
"````markdown\n```python\nprint('inner')\n```\n````",
"After",
]
+
+
+@pytest.mark.p2
+class TestMarkdownElementExtractorTables:
+    """Custom-delimiter extraction must keep pipe tables and HTML tables whole."""
+
+    def test_custom_delimiter_preserves_pipe_table(self, markdown_element_extractor):
+        """A bordered pipe table stays one section and reports its line span."""
+        text = "# Title\n\n| Name | Value |\n| --- | --- |\n| A | 1 |\n| B | 2 |\n\nAfter"
+
+        sections = markdown_element_extractor(text).extract_elements(delimiter="`\n`", include_meta=True)
+
+        assert [section["content"] for section in sections] == [
+            "# Title",
+            "| Name | Value |\n| --- | --- |\n| A | 1 |\n| B | 2 |",
+            "After",
+        ]
+        assert sections[1]["start_line"] == 2
+        assert sections[1]["end_line"] == 5
+
+    def test_custom_delimiter_preserves_borderless_pipe_table(self, markdown_element_extractor):
+        """A pipe table without outer borders is not split on its newlines."""
+        text = "Before\nName | Value\n--- | ---\nA | 1\nB | 2\nAfter"
+
+        sections = markdown_element_extractor(text).extract_elements(delimiter="`\n`")
+
+        assert sections == [
+            "Before",
+            "Name | Value\n--- | ---\nA | 1\nB | 2",
+            "After",
+        ]
+
+    def test_custom_delimiter_preserves_html_table(self, markdown_element_extractor):
+        """An HTML table containing newlines is kept as a single section."""
+        # The table must span several lines, otherwise the newline delimiter
+        # would never get a chance to split it.
+        table = "<table>\n<tr><td>A</td><td>1</td></tr>\n</table>"
+        text = f"Before\n{table}\nAfter"
+
+        sections = markdown_element_extractor(text).extract_elements(delimiter="`\n`")
+
+        assert sections == [
+            "Before",
+            table,
+            "After",
+        ]