diff --git a/deepdoc/parser/markdown_parser.py b/deepdoc/parser/markdown_parser.py
index d144cb75b0..a24799e5ab 100644
--- a/deepdoc/parser/markdown_parser.py
+++ b/deepdoc/parser/markdown_parser.py
@@ -177,6 +177,87 @@ class MarkdownElementExtractor:
 
         return ranges
 
+    def _table_cells(self, line):
+        """Return the stripped cells of a pipe-delimited line ([] if it has no pipe); one outer pipe per side is dropped."""
+        stripped = line.strip()
+        if "|" not in stripped:
+            return []
+        if stripped.startswith("|"):
+            stripped = stripped[1:]
+        if stripped.endswith("|"):
+            stripped = stripped[:-1]
+        return [cell.strip() for cell in stripped.split("|")]
+
+    def _is_table_row(self, line):
+        """Return True when the line splits into at least two cells and at least one of them is non-empty."""
+        cells = self._table_cells(line)
+        return len(cells) >= 2 and any(cell for cell in cells)
+
+    def _is_table_separator_row(self, line):
+        """Return True when the line has at least two cells and each is 3+ dashes with an optional ':' at either end."""
+        cells = self._table_cells(line)
+        return len(cells) >= 2 and all(re.match(r"^:?-{3,}:?$", cell.replace(" ", "")) for cell in cells)
+
+    def _markdown_table_ranges(self, text):
+        """Return (start, end) offsets of pipe tables in self.lines: a row, a separator row, then any following rows."""
+        ranges = []
+        line_offsets = self._line_start_offsets(text)
+
+        i = 0
+        while i < len(self.lines) - 1:
+            if not self._is_table_row(self.lines[i]) or not self._is_table_separator_row(self.lines[i + 1]):
+                i += 1
+                continue
+
+            end_line = i + 1
+            j = i + 2
+            while j < len(self.lines) and self._is_table_row(self.lines[j]):
+                end_line = j
+                j += 1
+
+            end_pos = min(len(text), line_offsets[end_line] + len(self.lines[end_line]))
+            ranges.append((line_offsets[i], end_pos))
+            i = end_line + 1
+
+        return ranges
+
+    def _html_table_ranges(self, text):
+        """Return (start, end) offsets of <table> elements in text, including an enclosing <body> or <html><body> wrapper."""
+        table_pattern = re.compile(
+            r"""
+            (?:
+                (?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
+                |
+                (?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
+                |
+                (?:<table[^>]*>.*?</table>)
+            )
+            """,
+            re.VERBOSE | re.DOTALL | re.IGNORECASE,
+        )
+        return [(match.start(), match.end()) for match in table_pattern.finditer(text)]
+
+    def _merge_ranges(self, ranges):
+        """Return the ranges sorted, with overlapping or touching ranges merged into one."""
+        if not ranges:
+            return []
+
+        merged = []
+        for start, end in sorted(ranges):
+            if not merged or start > merged[-1][1]:
+                merged.append((start, end))
+            else:
+                merged[-1] = (merged[-1][0], max(merged[-1][1], end))
+        return merged
+
+    def _protected_ranges(self, text):
+        """Return the merged fenced-code, pipe-table and HTML-table ranges of text."""
+        return self._merge_ranges(
+            self._fenced_code_ranges(text)
+            + self._markdown_table_ranges(text)
+            + self._html_table_ranges(text)
+        )
+
     def _append_delimited_section(self, sections, text, start, end, include_meta):
         part = text[start:end]
         if not part or not part.strip():
@@ -195,9 +276,9 @@ class MarkdownElementExtractor:
     def _extract_delimited_elements(self, text, delimiters, include_meta=False):
         sections = []
         pattern = re.compile(delimiters)
-        protected_ranges = self._fenced_code_ranges(text)
+        protected_ranges = self._protected_ranges(text)
         if protected_ranges:
-            logging.debug("markdown_parser: detected %d fenced ranges for delimiter extraction", len(protected_ranges))
+            logging.debug("markdown_parser: detected %d protected ranges for delimiter extraction", len(protected_ranges))
         protected_idx = 0
         last_end = 0
 
diff --git a/test/unit_test/deepdoc/parser/test_markdown_parser.py b/test/unit_test/deepdoc/parser/test_markdown_parser.py
index 8d51b32658..fee848be50 100644
--- a/test/unit_test/deepdoc/parser/test_markdown_parser.py
+++ b/test/unit_test/deepdoc/parser/test_markdown_parser.py
@@ -106,3 +106,41 @@ class TestMarkdownElementExtractorFences:
             "````markdown\n```python\nprint('inner')\n```\n````",
             "After",
         ]
+
+
+@pytest.mark.p2
+class TestMarkdownElementExtractorTables:
+    def test_custom_delimiter_preserves_pipe_table(self, markdown_element_extractor):
+        text = "# Title\n\n| Name | Value |\n| --- | --- |\n| A | 1 |\n| B | 2 |\n\nAfter"
+
+        sections = markdown_element_extractor(text).extract_elements(delimiter="`\n`", include_meta=True)
+
+        assert [section["content"] for section in sections] == [
+            "# Title",
+            "| Name | Value |\n| --- | --- |\n| A | 1 |\n| B | 2 |",
+            "After",
+        ]
+        assert sections[1]["start_line"] == 2
+        assert sections[1]["end_line"] == 5
+
+    def test_custom_delimiter_preserves_borderless_pipe_table(self, markdown_element_extractor):
+        text = "Before\nName | Value\n--- | ---\nA | 1\nB | 2\nAfter"
+
+        sections = markdown_element_extractor(text).extract_elements(delimiter="`\n`")
+
+        assert sections == [
+            "Before",
+            "Name | Value\n--- | ---\nA | 1\nB | 2",
+            "After",
+        ]
+
+    def test_custom_delimiter_preserves_html_table(self, markdown_element_extractor):
+        text = "Before\n<table>\n<tr><td>A</td></tr>\n<tr><td>B</td></tr>\n</table>\nAfter"
+
+        sections = markdown_element_extractor(text).extract_elements(delimiter="`\n`")
+
+        assert sections == [
+            "Before",
+            "<table>\n<tr><td>A</td></tr>\n<tr><td>B</td></tr>\n</table>",
+            "After",
+        ]