From ce9a4425d2f4d2d917e4cd30bd35f5f4ab377da3 Mon Sep 17 00:00:00 2001 From: dripsmvcp <138900956+dripsmvcp@users.noreply.github.com> Date: Wed, 20 May 2026 11:14:32 +0900 Subject: [PATCH] fix(imap): handle multi-address headers in _parse_singular_addr (#15006) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the RuntimeError with a warning + first-address fallback so a single email whose From header contains multiple addresses no longer crashes the entire IMAP sync task. Also add regression tests covering: - #14963: RFC 5322 quoted display names with commas (e.g. "Schlüter, Sabine" <sabine.schlueter@ihklw.de>) parsed as one address, not two. - #14964: multi-address headers warn instead of raising. Closes #14964 Refs #14963 --- common/data_source/imap_connector.py | 8 +- .../test_imap_connector_addr_parsing.py | 101 ++++++++++++++++++ 2 files changed, 105 insertions(+), 4 deletions(-) create mode 100644 test/unit_test/data_source/test_imap_connector_addr_parsing.py diff --git a/common/data_source/imap_connector.py b/common/data_source/imap_connector.py index a8c1988f6c..2f12e6be91 100644 --- a/common/data_source/imap_connector.py +++ b/common/data_source/imap_connector.py @@ -751,11 +751,11 @@ def _parse_singular_addr(raw_header: str) -> tuple[str, str]: addrs = _parse_addrs(raw_header=raw_header) if not addrs: return ("Unknown", "unknown@example.com") - elif len(addrs) >= 2: - raise RuntimeError( - f"Expected a singular address, but instead got multiple; {raw_header=} {addrs=}" + if len(addrs) >= 2: + logging.warning( + "Multiple addresses in header expected to be singular; using first. 
parsed_count=%d", + len(addrs), ) - return addrs[0] diff --git a/test/unit_test/data_source/test_imap_connector_addr_parsing.py b/test/unit_test/data_source/test_imap_connector_addr_parsing.py new file mode 100644 index 0000000000..c5a54e671f --- /dev/null +++ b/test/unit_test/data_source/test_imap_connector_addr_parsing.py @@ -0,0 +1,101 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Regression tests for IMAP connector address parsing. + +Covers: +- #14963: ``_parse_addrs`` must respect RFC 5322 quoted display names that + contain commas (e.g. ``"Schlüter, Sabine" <sabine.schlueter@ihklw.de>``) and return one address, + not two. +- #14964: ``_parse_singular_addr`` must not raise on multi-address headers; + it should warn and return the first address so a single odd email does not + crash the entire sync. 
+""" + +import logging + +import pytest + +from common.data_source.imap_connector import _parse_addrs, _parse_singular_addr + +pytestmark = pytest.mark.p2 + + +class TestParseAddrs: + def test_empty_header_returns_empty_list(self): + assert _parse_addrs("") == [] + + def test_simple_address(self): + assert _parse_addrs("user@example.com") == [("", "user@example.com")] + + def test_address_with_display_name(self): + assert _parse_addrs("Alice <alice@example.com>") == [ + ("Alice", "alice@example.com") + ] + + def test_quoted_display_name_with_comma_returns_single_address(self): + # #14963: the bug was that ``split(",")`` produced two bogus tuples. + # ``getaddresses`` must keep the quoted "Last, First" as one unit. + result = _parse_addrs('"Schlüter, Sabine" <sabine.schlueter@ihklw.de>') + assert result == [("Schlüter, Sabine", "sabine.schlueter@ihklw.de")] + + def test_multiple_addresses_comma_separated(self): + result = _parse_addrs("a@example.com, b@example.com") + assert result == [("", "a@example.com"), ("", "b@example.com")] + + def test_multiple_addresses_with_quoted_comma_in_name(self): + result = _parse_addrs( + '"Wilkens, Michael" <m@example.com>, "Müller, Hans" <h@example.com>' + ) + assert result == [ + ("Wilkens, Michael", "m@example.com"), + ("Müller, Hans", "h@example.com"), + ] + + +class TestParseSingularAddr: + def test_empty_header_returns_unknown(self): + assert _parse_singular_addr("") == ("Unknown", "unknown@example.com") + + def test_single_address(self): + assert _parse_singular_addr("Alice <alice@example.com>") == ( + "Alice", + "alice@example.com", + ) + + def test_quoted_comma_display_name_does_not_raise(self): + # #14963 cascade: before the fix, ``_parse_addrs`` returned two bogus + # tuples and ``_parse_singular_addr`` then raised RuntimeError. + assert _parse_singular_addr( + '"Schlüter, Sabine" <sabine.schlueter@ihklw.de>' + ) == ("Schlüter, Sabine", "sabine.schlueter@ihklw.de") + + def test_multi_address_header_warns_and_returns_first(self, caplog): + # #14964: a legitimately multi-address From header must not crash sync. 
+ header = '"User A" <a@example.com>, "User B" <b@example.com>' + with caplog.at_level(logging.WARNING): + result = _parse_singular_addr(header) + assert result == ("User A", "a@example.com") + assert any( + "Multiple addresses" in rec.message for rec in caplog.records + ), f"expected warning about multiple addresses, got: {caplog.records}" + + def test_multi_address_header_does_not_raise(self): + # Explicit guard: no RuntimeError should propagate. + try: + _parse_singular_addr("a@example.com, b@example.com") + except RuntimeError as e: # pragma: no cover - guard only + pytest.fail(f"_parse_singular_addr unexpectedly raised: {e}")