Files
ragflow/common/http_client.py

252 lines
9.4 KiB
Python
Raw Permalink Normal View History

# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import logging
import os
import time
from typing import Any, Dict, Optional
Potential fix for code scanning alert no. 57: Clear-text logging of sensitive information (#12071) Potential fix for [https://github.com/infiniflow/ragflow/security/code-scanning/57](https://github.com/infiniflow/ragflow/security/code-scanning/57) In general, the safest fix is to ensure that any logging of request URLs from `async_request` (and similar helpers) cannot include secrets. This can be done by (a) suppressing logging entirely for URLs considered sensitive, or (b) logging only a non-sensitive subset (e.g., scheme + host + path) and never query strings or credentials. The minimal, backward-compatible change here is to strengthen `_redact_sensitive_url_params` and `_is_sensitive_url` / the logging call so that we never log query parameters at all. Instead of logging the full URL (with redacted query), we can log only `scheme://netloc/path` and optionally strip userinfo. This retains useful observability (which endpoint, which method, response code, timing) while guaranteeing that no secrets in query strings or path segments appear in logs. Concretely: - Update `_redact_sensitive_url_params` to *not* include the query string in the returned value, and to drop any embedded userinfo (`username:password@host`). - Continue to wrap logging in a “sensitive URL” guard, but now the redaction routine itself ensures no secrets from query are present. - Leave callers (e.g., `github_callback`, `feishu_callback`) unchanged, since they only pass URLs and do not control the logging behavior directly. All changes are confined to `common/http_client.py` inside the provided snippet. No new imports are necessary. _Suggested fixes powered by Copilot Autofix. Review carefully before merging._ --------- Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
2025-12-22 13:31:03 +08:00
from urllib.parse import urlparse, urlunparse
from common import settings
import httpx
logger = logging.getLogger(__name__)

# Module-wide defaults. Every value can be overridden through an environment
# variable; the fallbacks are deliberately conservative so that importing this
# module does not change behaviour unexpectedly.
DEFAULT_TIMEOUT = float(os.getenv("HTTP_CLIENT_TIMEOUT", "15"))

# Redirect handling mirrors the `requests` defaults: redirects are followed,
# with an upper bound of 30 hops, unless overridden.
DEFAULT_FOLLOW_REDIRECTS = bool(int(os.getenv("HTTP_CLIENT_FOLLOW_REDIRECTS", "1")))
DEFAULT_MAX_REDIRECTS = int(os.getenv("HTTP_CLIENT_MAX_REDIRECTS", "30"))

# Retry policy: number of extra attempts and the base of the back-off delay.
DEFAULT_MAX_RETRIES = int(os.getenv("HTTP_CLIENT_MAX_RETRIES", "2"))
DEFAULT_BACKOFF_FACTOR = float(os.getenv("HTTP_CLIENT_BACKOFF_FACTOR", "0.5"))

# Optional proxy (None when the variable is unset) and the User-Agent header value.
DEFAULT_PROXY = os.getenv("HTTP_CLIENT_PROXY")
DEFAULT_USER_AGENT = os.getenv("HTTP_CLIENT_USER_AGENT", "ragflow-http-client")
def _clean_headers(
    headers: Optional[Dict[str, str]], auth_token: Optional[str] = None
) -> Optional[Dict[str, str]]:
    """Build the header mapping that is sent with a request.

    The result starts with the default ``User-Agent`` (when one is configured)
    and an ``Authorization`` header (when ``auth_token`` is truthy). Entries
    from ``headers`` are then layered on top, so a caller-supplied header with
    the same key replaces the default. Keys and values are coerced to ``str``
    and entries whose value is ``None`` are skipped.

    Returns:
        The merged mapping, or ``None`` when it would be empty.
    """
    result: Dict[str, str] = {}

    if DEFAULT_USER_AGENT:
        result["User-Agent"] = DEFAULT_USER_AGENT
    if auth_token:
        result["Authorization"] = auth_token

    if headers is not None:
        for name, value in headers.items():
            if value is not None:
                result[str(name)] = str(value)

    return result if result else None
def _get_delay(backoff_factor: float, attempt: int) -> float:
return backoff_factor * (2**attempt)
# Set of query-parameter names that are considered sensitive when URLs are logged.
# NOTE(review): nothing in the visible part of this file reads this constant;
# confirm whether other modules still use it before removing it.
_SENSITIVE_QUERY_KEYS = {"client_secret", "secret", "code", "access_token", "refresh_token", "password", "token", "app_secret"}
def _redact_sensitive_url_params(url: str) -> str:
Potential fix for code scanning alert no. 57: Clear-text logging of sensitive information (#12071) Potential fix for [https://github.com/infiniflow/ragflow/security/code-scanning/57](https://github.com/infiniflow/ragflow/security/code-scanning/57) In general, the safest fix is to ensure that any logging of request URLs from `async_request` (and similar helpers) cannot include secrets. This can be done by (a) suppressing logging entirely for URLs considered sensitive, or (b) logging only a non-sensitive subset (e.g., scheme + host + path) and never query strings or credentials. The minimal, backward-compatible change here is to strengthen `_redact_sensitive_url_params` and `_is_sensitive_url` / the logging call so that we never log query parameters at all. Instead of logging the full URL (with redacted query), we can log only `scheme://netloc/path` and optionally strip userinfo. This retains useful observability (which endpoint, which method, response code, timing) while guaranteeing that no secrets in query strings or path segments appear in logs. Concretely: - Update `_redact_sensitive_url_params` to *not* include the query string in the returned value, and to drop any embedded userinfo (`username:password@host`). - Continue to wrap logging in a “sensitive URL” guard, but now the redaction routine itself ensures no secrets from query are present. - Leave callers (e.g., `github_callback`, `feishu_callback`) unchanged, since they only pass URLs and do not control the logging behavior directly. All changes are confined to `common/http_client.py` inside the provided snippet. No new imports are necessary. _Suggested fixes powered by Copilot Autofix. Review carefully before merging._ --------- Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
2025-12-22 13:31:03 +08:00
"""
Return a version of the URL that is safe to log.
We intentionally drop query parameters and userinfo to avoid leaking
credentials or tokens via logs. Only scheme, host, port and path
are preserved.
"""
try:
parsed = urlparse(url)
Potential fix for code scanning alert no. 57: Clear-text logging of sensitive information (#12071) Potential fix for [https://github.com/infiniflow/ragflow/security/code-scanning/57](https://github.com/infiniflow/ragflow/security/code-scanning/57) In general, the safest fix is to ensure that any logging of request URLs from `async_request` (and similar helpers) cannot include secrets. This can be done by (a) suppressing logging entirely for URLs considered sensitive, or (b) logging only a non-sensitive subset (e.g., scheme + host + path) and never query strings or credentials. The minimal, backward-compatible change here is to strengthen `_redact_sensitive_url_params` and `_is_sensitive_url` / the logging call so that we never log query parameters at all. Instead of logging the full URL (with redacted query), we can log only `scheme://netloc/path` and optionally strip userinfo. This retains useful observability (which endpoint, which method, response code, timing) while guaranteeing that no secrets in query strings or path segments appear in logs. Concretely: - Update `_redact_sensitive_url_params` to *not* include the query string in the returned value, and to drop any embedded userinfo (`username:password@host`). - Continue to wrap logging in a “sensitive URL” guard, but now the redaction routine itself ensures no secrets from query are present. - Leave callers (e.g., `github_callback`, `feishu_callback`) unchanged, since they only pass URLs and do not control the logging behavior directly. All changes are confined to `common/http_client.py` inside the provided snippet. No new imports are necessary. _Suggested fixes powered by Copilot Autofix. Review carefully before merging._ --------- Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
2025-12-22 13:31:03 +08:00
# Remove any potential userinfo (username:password@)
netloc = parsed.hostname or ""
if parsed.port:
netloc = f"{netloc}:{parsed.port}"
# Reconstruct URL without query, params, fragment, or userinfo.
safe_url = urlunparse(
(
parsed.scheme,
netloc,
parsed.path,
"", # params
"", # query
"", # fragment
)
)
return safe_url
except Exception:
Potential fix for code scanning alert no. 57: Clear-text logging of sensitive information (#12071) Potential fix for [https://github.com/infiniflow/ragflow/security/code-scanning/57](https://github.com/infiniflow/ragflow/security/code-scanning/57) In general, the safest fix is to ensure that any logging of request URLs from `async_request` (and similar helpers) cannot include secrets. This can be done by (a) suppressing logging entirely for URLs considered sensitive, or (b) logging only a non-sensitive subset (e.g., scheme + host + path) and never query strings or credentials. The minimal, backward-compatible change here is to strengthen `_redact_sensitive_url_params` and `_is_sensitive_url` / the logging call so that we never log query parameters at all. Instead of logging the full URL (with redacted query), we can log only `scheme://netloc/path` and optionally strip userinfo. This retains useful observability (which endpoint, which method, response code, timing) while guaranteeing that no secrets in query strings or path segments appear in logs. Concretely: - Update `_redact_sensitive_url_params` to *not* include the query string in the returned value, and to drop any embedded userinfo (`username:password@host`). - Continue to wrap logging in a “sensitive URL” guard, but now the redaction routine itself ensures no secrets from query are present. - Leave callers (e.g., `github_callback`, `feishu_callback`) unchanged, since they only pass URLs and do not control the logging behavior directly. All changes are confined to `common/http_client.py` inside the provided snippet. No new imports are necessary. _Suggested fixes powered by Copilot Autofix. Review carefully before merging._ --------- Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
2025-12-22 13:31:03 +08:00
# If parsing fails, fall back to omitting the URL entirely.
return "<redacted-url>"
def _is_sensitive_url(url: str) -> bool:
"""Return True if URL is one of the configured OAuth endpoints."""
# Collect known sensitive endpoint URLs from settings
oauth_urls = set()
# GitHub OAuth endpoints
try:
if settings.GITHUB_OAUTH is not None:
url_val = settings.GITHUB_OAUTH.get("url")
if url_val:
oauth_urls.add(url_val)
except Exception:
pass
# Feishu OAuth endpoints
try:
if settings.FEISHU_OAUTH is not None:
for k in ("app_access_token_url", "user_access_token_url"):
url_val = settings.FEISHU_OAUTH.get(k)
if url_val:
oauth_urls.add(url_val)
except Exception:
pass
# Defensive normalization: compare only scheme+netloc+path
url_obj = urlparse(url)
for sensitive_url in oauth_urls:
sensitive_obj = urlparse(sensitive_url)
if (url_obj.scheme, url_obj.netloc, url_obj.path) == (sensitive_obj.scheme, sensitive_obj.netloc, sensitive_obj.path):
return True
return False
async def async_request(
    method: str,
    url: str,
    *,
    request_timeout: float | httpx.Timeout | None = None,
    follow_redirects: bool | None = None,
    max_redirects: Optional[int] = None,
    headers: Optional[Dict[str, str]] = None,
    auth_token: Optional[str] = None,
    retries: Optional[int] = None,
    backoff_factor: Optional[float] = None,
    proxy: Any = None,
    **kwargs: Any,
) -> httpx.Response:
    """Lightweight async HTTP wrapper using httpx.AsyncClient with safe defaults.

    A fresh ``httpx.AsyncClient`` is created for the call. The request is
    retried only when ``httpx.RequestError`` is raised; a response is returned
    as soon as one is received, whatever its status code.

    Args:
        method: HTTP method passed to ``client.request``.
        url: Target URL.
        request_timeout: Client timeout; ``DEFAULT_TIMEOUT`` when ``None``.
        follow_redirects: ``DEFAULT_FOLLOW_REDIRECTS`` when ``None``.
        max_redirects: ``DEFAULT_MAX_REDIRECTS`` when ``None``.
        headers: Extra headers, merged by ``_clean_headers``.
        auth_token: Value for the ``Authorization`` header, if any.
        retries: Number of extra attempts after the first one;
            ``DEFAULT_MAX_RETRIES`` when ``None``. Negative values are
            treated as 0.
        backoff_factor: Base for the delay computed by ``_get_delay``;
            ``DEFAULT_BACKOFF_FACTOR`` when ``None``.
        proxy: Proxy passed to the client; ``DEFAULT_PROXY`` when ``None``.
        **kwargs: Forwarded unchanged to ``client.request``.

    Returns:
        The ``httpx.Response`` of the first attempt that completes.

    Raises:
        httpx.RequestError: Re-raised from the final attempt once all
            retries are used up.
    """
    timeout = request_timeout if request_timeout is not None else DEFAULT_TIMEOUT
    if follow_redirects is None:
        follow_redirects = DEFAULT_FOLLOW_REDIRECTS
    if max_redirects is None:
        max_redirects = DEFAULT_MAX_REDIRECTS
    # Clamp AFTER applying the default: a negative HTTP_CLIENT_MAX_RETRIES
    # would otherwise make the retry loop run zero times.
    retries = max(DEFAULT_MAX_RETRIES if retries is None else retries, 0)
    if backoff_factor is None:
        backoff_factor = DEFAULT_BACKOFF_FACTOR
    headers = _clean_headers(headers, auth_token=auth_token)
    proxy = DEFAULT_PROXY if proxy is None else proxy

    def _loggable_url() -> Optional[str]:
        # Configured OAuth endpoints are never logged; every other URL is
        # reduced to a form without query string or credentials.
        if _is_sensitive_url(url):
            return None
        return _redact_sensitive_url_params(url)

    async with httpx.AsyncClient(
        timeout=timeout,
        follow_redirects=follow_redirects,
        max_redirects=max_redirects,
        proxy=proxy,
    ) as client:
        for attempt in range(retries + 1):
            try:
                start = time.monotonic()
                response = await client.request(
                    method=method, url=url, headers=headers, **kwargs
                )
                duration = time.monotonic() - start
                log_url = _loggable_url()
                if log_url is not None:
                    logger.debug(
                        "async_request %s %s -> %s in %.3fs",
                        method,
                        log_url,
                        response.status_code,
                        duration,
                    )
                return response
            except httpx.RequestError:
                log_url = _loggable_url()
                if attempt >= retries:
                    if log_url is not None:
                        logger.warning(
                            "async_request exhausted retries for %s %s",
                            method,
                            log_url,
                        )
                    raise
                delay = _get_delay(backoff_factor, attempt)
                if log_url is not None:
                    logger.warning(
                        "async_request attempt %d/%d failed for %s %s; retrying in %.2fs",
                        attempt + 1,
                        retries + 1,
                        method,
                        log_url,
                        delay,
                    )
                await asyncio.sleep(delay)
    # The loop runs at least once and its last iteration always returns or raises.
    raise AssertionError("async_request: retry loop ended without a result")  # pragma: no cover
def sync_request(
    method: str,
    url: str,
    *,
    timeout: float | httpx.Timeout | None = None,
    follow_redirects: bool | None = None,
    max_redirects: Optional[int] = None,
    headers: Optional[Dict[str, str]] = None,
    auth_token: Optional[str] = None,
    retries: Optional[int] = None,
    backoff_factor: Optional[float] = None,
    proxy: Any = None,
    **kwargs: Any,
) -> httpx.Response:
    """Synchronous counterpart to async_request, for CLI/tests or sync contexts.

    A fresh ``httpx.Client`` is created for the call. The request is retried
    only when ``httpx.RequestError`` is raised; a response is returned as soon
    as one is received, whatever its status code.

    URLs are logged under the same rules as in ``async_request``: configured
    OAuth endpoints are not logged at all, and every other URL is reduced by
    ``_redact_sensitive_url_params`` first. Failure logs carry only the
    exception class name, not its message.

    Args:
        method: HTTP method passed to ``client.request``.
        url: Target URL.
        timeout: Client timeout; ``DEFAULT_TIMEOUT`` when ``None``.
        follow_redirects: ``DEFAULT_FOLLOW_REDIRECTS`` when ``None``.
        max_redirects: ``DEFAULT_MAX_REDIRECTS`` when ``None``.
        headers: Extra headers, merged by ``_clean_headers``.
        auth_token: Value for the ``Authorization`` header, if any.
        retries: Number of extra attempts after the first one;
            ``DEFAULT_MAX_RETRIES`` when ``None``. Negative values are
            treated as 0.
        backoff_factor: Base for the delay computed by ``_get_delay``;
            ``DEFAULT_BACKOFF_FACTOR`` when ``None``.
        proxy: Proxy passed to the client; ``DEFAULT_PROXY`` when ``None``.
        **kwargs: Forwarded unchanged to ``client.request``.

    Returns:
        The ``httpx.Response`` of the first attempt that completes.

    Raises:
        httpx.RequestError: Re-raised from the final attempt once all
            retries are used up.
    """
    timeout = timeout if timeout is not None else DEFAULT_TIMEOUT
    if follow_redirects is None:
        follow_redirects = DEFAULT_FOLLOW_REDIRECTS
    if max_redirects is None:
        max_redirects = DEFAULT_MAX_REDIRECTS
    # Clamp AFTER applying the default: a negative HTTP_CLIENT_MAX_RETRIES
    # would otherwise make the retry loop run zero times.
    retries = max(DEFAULT_MAX_RETRIES if retries is None else retries, 0)
    if backoff_factor is None:
        backoff_factor = DEFAULT_BACKOFF_FACTOR
    headers = _clean_headers(headers, auth_token=auth_token)
    proxy = DEFAULT_PROXY if proxy is None else proxy

    def _loggable_url() -> Optional[str]:
        # Configured OAuth endpoints are never logged; every other URL is
        # reduced to a form without query string or credentials.
        if _is_sensitive_url(url):
            return None
        return _redact_sensitive_url_params(url)

    with httpx.Client(
        timeout=timeout,
        follow_redirects=follow_redirects,
        max_redirects=max_redirects,
        proxy=proxy,
    ) as client:
        for attempt in range(retries + 1):
            try:
                start = time.monotonic()
                response = client.request(
                    method=method, url=url, headers=headers, **kwargs
                )
                duration = time.monotonic() - start
                log_url = _loggable_url()
                if log_url is not None:
                    logger.debug(
                        "sync_request %s %s -> %s in %.3fs",
                        method,
                        log_url,
                        response.status_code,
                        duration,
                    )
                return response
            except httpx.RequestError as exc:
                log_url = _loggable_url()
                # Log the error class only; the message text is not under our control.
                error_name = type(exc).__name__
                if attempt >= retries:
                    if log_url is not None:
                        logger.warning(
                            "sync_request exhausted retries for %s %s: %s",
                            method,
                            log_url,
                            error_name,
                        )
                    raise
                delay = _get_delay(backoff_factor, attempt)
                if log_url is not None:
                    logger.warning(
                        "sync_request attempt %d/%d failed for %s %s: %s; retrying in %.2fs",
                        attempt + 1,
                        retries + 1,
                        method,
                        log_url,
                        error_name,
                        delay,
                    )
                time.sleep(delay)
    # The loop runs at least once and its last iteration always returns or raises.
    raise AssertionError("sync_request: retry loop ended without a result")  # pragma: no cover
# Public API of this module: the two request helpers plus the default knobs
# defined at the top of the file. Underscore-prefixed helpers are not exported.
__all__ = [
"async_request",
"sync_request",
"DEFAULT_TIMEOUT",
"DEFAULT_FOLLOW_REDIRECTS",
"DEFAULT_MAX_REDIRECTS",
"DEFAULT_MAX_RETRIES",
"DEFAULT_BACKOFF_FACTOR",
"DEFAULT_PROXY",
"DEFAULT_USER_AGENT",
]