#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
"""SoMark document parser adapter.

Bridges RAGFlow's PDF parsing pipeline to the SoMark async HTTP API. Submits a
PDF, polls the async task until completion, then maps SoMark's structured JSON
blocks into the (text, layout_type, line_tag) triples that RAGFlow's downstream
chunker expects.
"""

import json
import logging
import os
import random
import re
import shutil
import sys
import tempfile
import threading
import time
from enum import StrEnum
from io import BytesIO
from os import PathLike
from pathlib import Path
from typing import Callable, Optional

import numpy as np
import pdfplumber
import requests
from PIL import Image

from deepdoc.parser.pdf_parser import RAGFlowPdfParser
from deepdoc.parser.utils import extract_pdf_outlines
from common.constants import MAXIMUM_PAGE_NUMBER

# A single process-wide lock for pdfplumber, stored in ``sys.modules`` under a
# well-known key so that every module using the same key shares one lock object.
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
if LOCK_KEY_pdfplumber not in sys.modules:
    sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()


class SoMarkBlockType(StrEnum):
    """All block.type values returned by SoMark JSON output."""

    TEXT = "text"
    TITLE = "title"
    FIGURE = "figure"
    FIGURE_CAPTION = "figure_caption"
    TABLE = "table"
    TABLE_CAPTION = "table_caption"
    HEADER = "header"
    FOOTER = "footer"
    FOOTNOTE = "footnote"
    SIDER = "sider"
    CATE = "cate"
    CATE_ITEM = "cate_item"
    CODE = "code"
    CHOICE = "choice"
    BLANK = "blank"
    QRCODE = "qrcode"
    STAMP = "stamp"
    REFERENCE = "reference"
    EQUATION = "equation"
    CS = "cs"
    CS_EQUATION = "cs_equation"


# Map each SoMark type to RAGFlow's internal layout type.
# Internal types used downstream: text / table / image / equation / code / discarded.
SOMARK_TYPE_TO_RAGFLOW = {
    SoMarkBlockType.TEXT: "text",
    SoMarkBlockType.TITLE: "text",
    SoMarkBlockType.FIGURE: "image",
    SoMarkBlockType.FIGURE_CAPTION: "text",
    SoMarkBlockType.TABLE: "table",
    SoMarkBlockType.TABLE_CAPTION: "text",
    SoMarkBlockType.FOOTNOTE: "text",
    SoMarkBlockType.SIDER: "text",
    SoMarkBlockType.CODE: "code",
    SoMarkBlockType.CHOICE: "text",
    SoMarkBlockType.REFERENCE: "text",
    SoMarkBlockType.EQUATION: "equation",
    SoMarkBlockType.CS: "image",
    SoMarkBlockType.CS_EQUATION: "text",
    SoMarkBlockType.QRCODE: "image",
    SoMarkBlockType.STAMP: "image",
    # header/footer resolved at runtime based on keep_header_footer flag.
    # cate/cate_item/blank are always discarded.
}

# Block types that are always dropped (TOC noise, empty form fields).
ALWAYS_DISCARDED = {
    SoMarkBlockType.CATE,
    SoMarkBlockType.CATE_ITEM,
    SoMarkBlockType.BLANK,
}


class SoMarkAPIError(RuntimeError):
    """Raised when SoMark API returns a non-zero ``code`` or HTTP failure."""


class SoMarkParser(RAGFlowPdfParser):
    """Parse a PDF via SoMark's async HTTP API and convert blocks to RAGFlow sections."""

    SUBMIT_PATH = "/parse/async"
    CHECK_PATH = "/parse/async_check"
    USAGE_PATH = "/usage"

    # /usage quota check only works in SaaS; private deployments fall back
    # to a generic HEAD health check.
    SAAS_BASE_URL = "https://somark.tech/api/v1"
    USAGE_REQUEST_TIMEOUT = 10  # /usage request timeout

    # SoMark error codes
    QPS_LIMIT_CODE = 1124  # rate limited; retry with backoff when hit during submission
    INVALID_API_KEY_CODE = 1107  # returned by /usage check when API key is invalid

    # Submission phase: retry "concurrency slots full" rejections within a fixed budget
    SUBMIT_BUDGET_SECONDS = 10 * 60  # total submission retry budget (10 min)
    SUBMIT_BACKOFF_BASE_SECONDS = 1.0  # initial backoff interval
    SUBMIT_BACKOFF_MAX_SECONDS = 10.0  # max single backoff interval
    SUBMIT_BACKOFF_JITTER_SECONDS = 0.5  # jitter to avoid thundering herd from concurrent callers
    SUBMIT_REQUEST_TIMEOUT = 60  # single submit request timeout

    # Polling phase: keep querying task status until success / failure / budget exhausted
    POLL_BUDGET_SECONDS = 10 * 60  # max time to wait for a single task
    POLL_INTERVAL_BASE_SECONDS = 2.0  # initial polling interval
    POLL_INTERVAL_MAX_SECONDS = 10.0  # max polling interval for long-running tasks
    POLL_INTERVAL_GROWTH = 1.5  # multiplier applied after each poll
    POLL_REQUEST_TIMEOUT = 30  # single poll request timeout

    def __init__(
        self,
        base_url: str,
        api_key: str = "",
        *,
        image_format: str = "url",
        formula_format: str = "latex",
        table_format: str = "html",
        cs_format: str = "image",
        enable_text_cross_page: bool = False,
        enable_table_cross_page: bool = False,
        enable_title_level_recognition: bool = False,
        enable_inline_image: bool = False,
        enable_table_image: bool = True,
        enable_image_understanding: bool = True,
        keep_header_footer: bool = False,
    ):
        """Store the endpoint, credentials and per-request SoMark options.

        ``*_format`` values are normalised (stripped, lower-cased) and sent as
        the ``element_formats`` JSON field; the ``enable_*`` flags and
        ``keep_header_footer`` are coerced to ``bool`` and sent as
        ``feature_config``. ``keep_header_footer`` additionally decides locally
        whether header/footer blocks are kept as text or dropped.

        NOTE(review): ``super().__init__()`` is not called here; presumably
        deliberate, to avoid the base parser's own setup. Confirm against
        ``RAGFlowPdfParser``.
        """
        self.base_url = base_url.strip().rstrip("/")
        # Intentionally NOT stripping: caller may want to pass raw key as-is
        # (e.g. for verification where whitespace would also be reported back).
        self.api_key = api_key
        self.element_formats = {
            "image": image_format.strip().lower(),
            "formula": formula_format.strip().lower(),
            "table": table_format.strip().lower(),
            "cs": cs_format.strip().lower(),
        }
        self.feature_config = {
            "enable_text_cross_page": bool(enable_text_cross_page),
            "enable_table_cross_page": bool(enable_table_cross_page),
            "enable_title_level_recognition": bool(enable_title_level_recognition),
            "enable_inline_image": bool(enable_inline_image),
            "enable_table_image": bool(enable_table_image),
            "enable_image_understanding": bool(enable_image_understanding),
            "keep_header_footer": bool(keep_header_footer),
        }
        self.outlines: list = []
        self.logger = logging.getLogger(self.__class__.__name__)

    # ---------------------------------------------------------------------
    # Reachability check
    # ---------------------------------------------------------------------
    @staticmethod
    def _is_http_endpoint_valid(url: str, timeout: int = 5) -> bool:
        """Return True when a HEAD request to ``url`` answers with a status below 500.

        Any failure to complete the request counts as "not valid"; this is a
        best-effort probe and never raises.
        """
        try:
            response = requests.head(url, timeout=timeout, allow_redirects=True)
        except Exception as exc:
            # Best-effort probe: report unreachable, but leave a trace for debugging.
            logging.debug("[SoMark] HEAD %s failed: %s", url, exc)
            return False
        return response.status_code < 500

    def check_installation(self) -> tuple[bool, str]:
        """Check that the configured SoMark endpoint is usable.

        Returns:
            ``(True, "")`` when usable, otherwise ``(False, reason)``.
        """
        if not self.base_url:
            return False, "[SoMark] SOMARK_BASE_URL not configured."
        if not self.base_url.startswith(("http://", "https://")):
            return False, "[SoMark] SOMARK_BASE_URL must start with http:// or https://."

        # SaaS deployment: hit /usage to verify API key validity and remaining quota.
        if self.base_url == self.SAAS_BASE_URL:
            return self._check_saas_usage()

        # Private deployment: use a cheap HEAD health check.
        if not self._is_http_endpoint_valid(self.base_url):
            return False, f"[SoMark] server unreachable: {self.base_url}"
        return True, ""

    def _check_saas_usage(self) -> tuple[bool, str]:
        """Verify api_key and remaining quota against the hosted SoMark service.

        Treats two specific business outcomes as user-facing errors:

        - ``code == 1107``: invalid API key.
        - ``code == 0`` but both ``remaining_paid_pages`` and
          ``remaining_free_pages_this_month`` are 0: out of parse quota.

        Returns:
            ``(True, "")`` when the key is accepted and quota remains,
            otherwise ``(False, reason)``. Never raises for HTTP/JSON problems.
        """
        url = f"{self.base_url}{self.USAGE_PATH}"
        data = self._auth_field()
        try:
            resp = requests.post(url, data=data, timeout=self.USAGE_REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            return False, f"[SoMark] usage check failed: {exc}"

        if resp.status_code >= 500:
            return False, (f"[SoMark] usage HTTP {resp.status_code}: {resp.text[:200]}")
        try:
            body = resp.json()
        except ValueError:
            return False, (f"[SoMark] usage non-JSON response ({resp.status_code}): {resp.text[:200]}")
        if not isinstance(body, dict):
            # Valid JSON but not an object (e.g. a list): ``.get`` below would crash.
            return False, (f"[SoMark] usage unexpected JSON payload ({resp.status_code}): {resp.text[:200]}")

        code = body.get("code")
        message = body.get("message") or ""
        if code == self.INVALID_API_KEY_CODE:
            return False, f"[SoMark] {message or 'Invalid API key'}"
        if code != 0:
            return False, f"[SoMark] usage error code={code} message={message}"

        usage = body.get("data") or {}
        if not isinstance(usage, dict):
            return False, (f"[SoMark] usage unexpected JSON payload ({resp.status_code}): {resp.text[:200]}")
        paid = usage.get("remaining_paid_pages") or 0
        free = usage.get("remaining_free_pages_this_month") or 0
        if paid == 0 and free == 0:
            return False, ("[SoMark] insufficient parse pages (remaining_paid_pages=0, remaining_free_pages_this_month=0)")
        return True, ""

    # ---------------------------------------------------------------------
    # HTTP helpers
    # ---------------------------------------------------------------------
    def _auth_field(self) -> dict:
        """Return the api_key multipart field if configured; empty dict otherwise.

        SoMark's hosted API requires ``api_key``. Local deployments reject the
        field outright, so we omit it entirely when blank.
        """
        return {"api_key": self.api_key} if self.api_key else {}

    def _submit_task(self, pdf_path: Path, callback: Optional[Callable] = None) -> str:
        """Upload ``pdf_path`` to the async parse endpoint and return the task id.

        A response carrying ``QPS_LIMIT_CODE`` is retried with capped
        exponential backoff plus jitter until ``SUBMIT_BUDGET_SECONDS`` is
        used up; every other failure is raised immediately.

        Args:
            pdf_path: PDF file on disk to upload.
            callback: Optional ``callback(progress, message)`` progress hook.

        Raises:
            SoMarkAPIError: on request failure, HTTP >= 500, non-JSON or
                non-object body, a non-retryable business code, a missing
                ``task_id``, or an exhausted retry budget.
        """
        url = f"{self.base_url}{self.SUBMIT_PATH}"
        data = {
            "output_formats": ["json"],
            "element_formats": json.dumps(self.element_formats, ensure_ascii=False),
            "feature_config": json.dumps(self.feature_config, ensure_ascii=False),
        }
        data.update(self._auth_field())

        safe_keys = [k for k in data if k != "api_key"]
        self.logger.info(f"[SoMark] submit {url} keys={safe_keys} has_api_key={bool(self.api_key)}")
        if callback:
            callback(0.20, f"[SoMark] submitting task to {url}")

        # multipart fields with list values must be sent as repeated tuples.
        # The fields do not change between retries, so flatten them once.
        form_data = []
        for key, value in data.items():
            values = value if isinstance(value, list) else [value]
            form_data.extend((key, str(v)) for v in values)

        deadline = time.monotonic() + self.SUBMIT_BUDGET_SECONDS
        attempt = 0
        while True:
            try:
                # Re-open the file on every attempt so each upload starts at offset 0.
                with open(pdf_path, "rb") as fh:
                    files = {"file": (pdf_path.name, fh, "application/pdf")}
                    resp = requests.post(
                        url,
                        files=files,
                        data=form_data,
                        timeout=self.SUBMIT_REQUEST_TIMEOUT,
                    )
            except requests.RequestException as exc:
                raise SoMarkAPIError(f"[SoMark] submit failed: {exc}") from exc

            # Inline parsing so the QPS-limit code can be distinguished from
            # other business errors before raising.
            if resp.status_code >= 500:
                raise SoMarkAPIError(f"[SoMark] submit HTTP {resp.status_code}: {resp.text[:200]}")
            try:
                body = resp.json()
            except ValueError as exc:
                raise SoMarkAPIError(f"[SoMark] submit non-JSON response ({resp.status_code}): {resp.text[:200]}") from exc
            if not isinstance(body, dict):
                raise SoMarkAPIError(f"[SoMark] submit unexpected JSON payload ({resp.status_code}): {resp.text[:200]}")

            code = body.get("code")
            if code == 0:
                task_id = (body.get("data") or {}).get("task_id")
                if not task_id:
                    raise SoMarkAPIError(f"[SoMark] submit returned no task_id: {body}")
                self.logger.info(f"[SoMark] task submitted, task_id={task_id} attempts={attempt + 1}")
                return task_id

            # QPS / concurrency rejection: exponential backoff within budget.
            if code == self.QPS_LIMIT_CODE:
                backoff = min(
                    self.SUBMIT_BACKOFF_BASE_SECONDS * (2**attempt),
                    self.SUBMIT_BACKOFF_MAX_SECONDS,
                )
                wait = backoff + random.random() * self.SUBMIT_BACKOFF_JITTER_SECONDS
                if time.monotonic() + wait > deadline:
                    raise SoMarkAPIError("[SoMark] submit blocked by QPS limit; retry budget exhausted")
                self.logger.info(
                    "[SoMark] submit hit QPS limit, retrying in %.2fs (attempt=%d)",
                    wait,
                    attempt + 1,
                )
                if callback:
                    callback(
                        0.20,
                        f"[SoMark] busy (QPS limit), backing off {wait:.2f}s before retry",
                    )
                time.sleep(wait)
                attempt += 1
                continue

            # Any other non-zero code: not retryable.
            raise SoMarkAPIError(f"[SoMark] submit business error code={code} message={body.get('message')}")

    def _poll_task(self, task_id: str, callback: Optional[Callable] = None) -> dict:
        """Poll the task-status endpoint until the task succeeds, fails or times out.

        The polling interval starts at ``POLL_INTERVAL_BASE_SECONDS`` and is
        multiplied by ``POLL_INTERVAL_GROWTH`` after each poll, capped at
        ``POLL_INTERVAL_MAX_SECONDS``. The progress callback fires on every
        fifth poll.

        Returns:
            The ``result`` object of the finished task.

        Raises:
            SoMarkAPIError: on request failure, an error response, status
                ``FAILED``, ``SUCCESS`` without a result, or when
                ``POLL_BUDGET_SECONDS`` elapses first.
        """
        url = f"{self.base_url}{self.CHECK_PATH}"
        deadline = time.monotonic() + self.POLL_BUDGET_SECONDS
        interval = self.POLL_INTERVAL_BASE_SECONDS
        started_at = time.monotonic()
        attempt = 0

        while time.monotonic() < deadline:
            # Sleep first: the task was just submitted, an immediate poll is wasteful.
            time.sleep(interval)
            attempt += 1

            data = {"task_id": task_id}
            data.update(self._auth_field())
            try:
                resp = requests.post(url, data=data, timeout=self.POLL_REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                raise SoMarkAPIError(f"[SoMark] poll request failed: {exc}") from exc

            body = self._parse_json_body(resp, "poll")
            payload = body.get("data") or {}
            status = payload.get("status")
            elapsed = time.monotonic() - started_at

            if status == "SUCCESS":
                self.logger.info(f"[SoMark] task {task_id} completed after {attempt} poll(s) in {elapsed:.1f}s")
                result = payload.get("result")
                if not result:
                    raise SoMarkAPIError(f"[SoMark] SUCCESS but no result: {body}")
                return result
            if status == "FAILED":
                raise SoMarkAPIError(f"[SoMark] task {task_id} FAILED: {body.get('message')}")

            if callback and attempt % 5 == 0:
                callback(
                    0.40,
                    f"[SoMark] still {status}, polled {attempt} time(s) (elapsed={elapsed:.1f}s, next in {interval:.1f}s)",
                )
            interval = min(
                interval * self.POLL_INTERVAL_GROWTH,
                self.POLL_INTERVAL_MAX_SECONDS,
            )

        raise SoMarkAPIError(f"[SoMark] task {task_id} timed out after {self.POLL_BUDGET_SECONDS}s while waiting")

    @staticmethod
    def _parse_json_body(resp: requests.Response, stage: str) -> dict:
        """Decode a SoMark response and return its JSON object when ``code == 0``.

        Args:
            resp: HTTP response to decode.
            stage: Short label (e.g. ``"poll"``) used in error messages.

        Raises:
            SoMarkAPIError: on HTTP >= 500, a non-JSON body, a JSON body that
                is not an object, or a non-zero ``code``.
        """
        if resp.status_code >= 500:
            raise SoMarkAPIError(f"[SoMark] {stage} HTTP {resp.status_code}: {resp.text[:200]}")
        try:
            body = resp.json()
        except ValueError as exc:
            raise SoMarkAPIError(f"[SoMark] {stage} non-JSON response ({resp.status_code}): {resp.text[:200]}") from exc
        if not isinstance(body, dict):
            # Valid JSON but not an object: callers rely on ``body.get``.
            raise SoMarkAPIError(f"[SoMark] {stage} unexpected JSON payload ({resp.status_code}): {resp.text[:200]}")
        code = body.get("code")
        if code != 0:
            raise SoMarkAPIError(f"[SoMark] {stage} business error code={code} message={body.get('message')}")
        return body

    # ---------------------------------------------------------------------
    # Page image rendering
    # ---------------------------------------------------------------------
    def __images__(
        self,
        fnm,
        zoomin: int = 1,
        page_from: int = 0,
        page_to: int = MAXIMUM_PAGE_NUMBER,
        callback=None,
    ):
        """Render pages ``[page_from:page_to]`` of the PDF into ``self.page_images``.

        ``fnm`` is either a path (``str``/``PathLike``) or raw PDF bytes. Pages
        are rendered at ``72 * zoomin`` DPI. On any failure the exception is
        logged and ``self.page_images`` is set to ``None`` (``self.total_page``
        to 0); nothing is raised. ``callback`` is accepted but not used.
        """
        self.page_from = page_from
        self.page_to = page_to
        try:
            source = fnm if isinstance(fnm, (str, PathLike)) else BytesIO(fnm)
            # Hold the shared pdfplumber lock created at module import while
            # opening and rasterising, so concurrent parsers do not interleave.
            with sys.modules[LOCK_KEY_pdfplumber]:
                with pdfplumber.open(source) as pdf:
                    self.pdf = pdf
                    self.page_images = [
                        page.to_image(resolution=72 * zoomin, antialias=True).original
                        for page in pdf.pages[page_from:page_to]
                    ]
        except Exception as exc:
            self.page_images = None
            self.total_page = 0
            self.logger.exception(exc)

    # ---------------------------------------------------------------------
    # Position tagging (compatible with RAGFlow's extract_positions/crop)
    # ---------------------------------------------------------------------
    def _line_tag(self, bx: dict) -> str:
        """Build a ``@@page\\tx0\\tx1\\ty0\\ty1##`` tag.

        bx requires keys: ``page_idx`` (0-based), ``bbox`` ([x1,y1,x2,y2] in
        SoMark's reported pixel coordinates), ``page_size`` ({h,w}).

        When a rendered image exists for the page, coordinates are rescaled
        from ``page_size`` to that image's pixel size. Coordinates are clamped
        to be non-negative because ``extract_positions`` only matches unsigned
        numbers; a negative value would make the tag unparseable.
        """
        page_idx = bx.get("page_idx", 0)
        pn = [page_idx + 1]
        bbox = bx.get("bbox") or (0, 0, 0, 0)
        if len(bbox) != 4:
            bbox = (0, 0, 0, 0)
        x0, top, x1, bott = bbox

        page_size = bx.get("page_size") or {}
        src_w = page_size.get("w") or 1
        src_h = page_size.get("h") or 1

        if x0 > x1:
            x0, x1 = x1, x0
        if top > bott:
            top, bott = bott, top

        images = getattr(self, "page_images", None)
        # Bounds-check from below too: a negative index would silently pick a
        # page counted from the end of the list.
        if images and 0 <= page_idx < len(images):
            page_width, page_height = images[page_idx].size
            x0 = (x0 / src_w) * page_width
            x1 = (x1 / src_w) * page_width
            top = (top / src_h) * page_height
            bott = (bott / src_h) * page_height

        x0, x1, top, bott = (max(0.0, float(v)) for v in (x0, x1, top, bott))
        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format("-".join(str(p) for p in pn), x0, x1, top, bott)

    @staticmethod
    def extract_positions(txt: str):
        """Parse every ``@@pages\\tleft\\tright\\ttop\\tbottom##`` tag found in ``txt``.

        Returns:
            A list of ``(page_indices, left, right, top, bottom)`` tuples where
            ``page_indices`` are 0-based (the tag stores them 1-based, joined
            by ``-``) and the coordinates are floats.
        """
        poss = []
        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
            pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
            left, right, top, bottom = (
                float(left),
                float(right),
                float(top),
                float(bottom),
            )
            poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
        return poss

    def crop(self, text, ZM=1, need_position=False):
        """Image crop based on tags.

        Crops every tagged region of ``text`` out of the rendered page images
        and stacks the crops vertically on a light-grey canvas. A dimmed strip
        of up to 120px of page context is added above the first region and
        below the last one.

        Args:
            text: Text containing position tags as produced by ``_line_tag``.
            ZM: Accepted for interface compatibility; not used here.
            need_position: When true, also return the cropped positions.

        Returns:
            The composed image, or ``(image, positions)`` when
            ``need_position`` is true, where each position is
            ``(page_number, x0, x1, y0, y1)`` for a content (non-context) crop.
            ``None`` / ``(None, None)`` when there is nothing to crop.
        """
        imgs = []  # (cropped image, is_context) in stacking order
        poss = self.extract_positions(text)
        if not poss:
            return (None, None) if need_position else None
        if not getattr(self, "page_images", None):
            self.logger.warning("[SoMark] crop called without page images; skip.")
            return (None, None) if need_position else None

        # Drop page indices that have no rendered image.
        page_count = len(self.page_images)
        filtered = []
        for pns, left, right, top, bottom in poss:
            valid_pns = [p for p in pns if 0 <= p < page_count]
            if valid_pns:
                filtered.append((valid_pns, left, right, top, bottom))
        if not filtered:
            return (None, None) if need_position else None
        poss = filtered

        max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
        GAP = 6

        # Leading context strip: the area just above the first region.
        first = poss[0]
        poss.insert(
            0,
            (
                [first[0][0]],
                first[1],
                first[2],
                max(0, first[3] - 120),
                max(first[3] - GAP, 0),
            ),
        )

        # Trailing context strip: the area just below the last region.
        last = poss[-1]
        last_pn = last[0][-1]
        if not (0 <= last_pn < page_count):
            return (None, None) if need_position else None
        last_h = self.page_images[last_pn].size[1]
        poss.append(
            (
                [last_pn],
                last[1],
                last[2],
                min(last_h, last[4] + GAP),
                min(last_h, last[4] + 120),
            )
        )

        positions = []
        last_idx = len(poss) - 1
        for ii, (pns, left, right, top, bottom) in enumerate(poss):
            is_context = ii == 0 or ii == last_idx
            right = left + max_width
            if bottom <= top:
                bottom = top + 2

            # For a region continuing onto later pages, express ``bottom`` as an
            # offset from the top of the first page by adding the heights of
            # the pages that precede each continuation page.
            for pn in pns[1:]:
                if 0 <= pn - 1 < page_count:
                    bottom += self.page_images[pn - 1].size[1]

            if not (0 <= pns[0] < page_count):
                continue
            base_img = self.page_images[pns[0]]
            x0, y0, x1, y1 = (
                int(left),
                int(top),
                int(right),
                int(min(bottom, base_img.size[1])),
            )
            if x0 > x1:
                x0, x1 = x1, x0
            if y0 > y1:
                y0, y1 = y1, y0
            if x1 <= x0 or y1 <= y0:
                continue
            imgs.append((base_img.crop((x0, y0, x1, y1)), is_context))
            if not is_context:
                positions.append((pns[0] + self.page_from, x0, x1, y0, y1))

            # Remaining height spills over onto the continuation pages.
            bottom -= base_img.size[1]
            for pn in pns[1:]:
                if not (0 <= pn < page_count):
                    continue
                page = self.page_images[pn]
                x0, y0, x1, y1 = (
                    int(left),
                    0,
                    int(right),
                    int(min(bottom, page.size[1])),
                )
                if x0 > x1:
                    x0, x1 = x1, x0
                if y0 > y1:
                    y0, y1 = y1, y0
                if x1 <= x0 or y1 <= y0:
                    bottom -= page.size[1]
                    continue
                imgs.append((page.crop((x0, y0, x1, y1)), is_context))
                if not is_context:
                    positions.append((pn + self.page_from, x0, x1, y0, y1))
                bottom -= page.size[1]

        if not imgs:
            return (None, None) if need_position else None

        height = sum(img.size[1] + GAP for img, _ in imgs)
        width = int(np.max([img.size[0] for img, _ in imgs]))
        pic = Image.new("RGB", (width, int(height)), (245, 245, 245))
        offset = 0
        for img, is_context in imgs:
            # Dim only real context strips. Deciding by list position would dim
            # a content crop whenever a context strip was skipped as empty
            # (e.g. a region that touches the bottom edge of the page).
            if is_context:
                img = img.convert("RGBA")
                overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
                overlay.putalpha(128)
                img = Image.alpha_composite(img, overlay).convert("RGB")
            pic.paste(img, (0, int(offset)))
            offset += img.size[1] + GAP

        return (pic, positions) if need_position else pic

    # ---------------------------------------------------------------------
    # SoMark JSON -> RAGFlow sections
    # ---------------------------------------------------------------------
    def _resolve_internal_type(self, block_type: str) -> Optional[str]:
        """Resolve the RAGFlow internal layout type, or ``None`` to discard.

        Header/footer obey ``keep_header_footer``; cate/cate_item/blank always
        drop. Unknown SoMark types fall back to ``text`` to avoid silent loss.
        """
        if block_type in ALWAYS_DISCARDED:
            return None
        if block_type == SoMarkBlockType.HEADER or block_type == SoMarkBlockType.FOOTER:
            return "text" if self.feature_config.get("keep_header_footer") else None
        return SOMARK_TYPE_TO_RAGFLOW.get(block_type, "text")

    @staticmethod
    def _block_text(block: dict, internal_type: str) -> str:
        """Extract the textual payload for a block.

        ``image``-typed blocks contribute no text (image only); everything
        else falls back to ``content``. For ``title`` blocks with title_level
        we prepend markdown-style hashes so downstream tokenization preserves
        hierarchy.
        """
        if internal_type == "image":
            return ""
        content = (block.get("content") or "").strip()
        if block.get("type") == SoMarkBlockType.TITLE.value:
            level = block.get("title_level")
            if isinstance(level, int) and 1 <= level <= 6:
                content = ("#" * level) + " " + content
        return content

    def _transfer_to_sections(self, pages: list[dict], parse_method: Optional[str] = None) -> list[tuple]:
        """Convert SoMark ``pages`` into RAGFlow section tuples.

        Args:
            pages: SoMark page objects, each with ``page_num``, ``page_size``
                and ``blocks``.
            parse_method: ``"manual"`` or ``"pipeline"`` selects typed
                3-tuples ``(text, layout_type, line_tag)``; anything else
                selects 2-tuples ``(text, line_tag)``.

        Returns:
            Sections in document order. Discarded block types, image blocks
            without a 4-element bbox, and non-image blocks with empty text are
            skipped.
        """
        # MinerU contract: manual/pipeline callers (the rag/flow DAG) want typed
        # 3-tuples ``(text, layout_type, line_tag)`` so the consumer can set
        # box["layout_type"] and crop via the separate position field; every other
        # caller (naive.py standard chunking) wants 2-tuples ``(text, line_tag)``
        # that naive_merge consumes directly.
        typed = parse_method in {"manual", "pipeline"}
        sections: list[tuple] = []
        image_seq = 0
        for page in pages or []:
            page_idx = page.get("page_num", 0)
            page_size = page.get("page_size") or {}
            for block in page.get("blocks") or []:
                btype = block.get("type")
                internal = self._resolve_internal_type(btype)
                if internal is None:
                    continue

                # Inject page_idx so _line_tag can compute coords.
                bbox = block.get("bbox")
                tag_input = {
                    "page_idx": page_idx,
                    "bbox": bbox,
                    "page_size": page_size,
                }

                if internal == "image":
                    # Align with MinerU: the figure is recovered by cropping the
                    # locally rendered page region via crop(), not from img_url.
                    if not bbox or len(bbox) != 4:
                        continue  # no geometry -> nothing to crop
                    line_tag = self._line_tag(tag_input)
                    image_seq += 1
                    caption = (block.get("content") or "").strip()
                    label = caption or f"{btype} {image_seq}"
                    if typed:
                        # 3-tuple: layout_type + a real (separate) position field;
                        # keep the caption in text so figure understanding can be
                        # embedded and retrieved while crop() still uses line_tag.
                        sections.append((label, internal, line_tag))
                    else:
                        # 2-tuple (naive.py): the chunk id is hash(content + doc_id),
                        # so an empty-text image chunk would collide across every
                        # figure and all but one would be dropped on upsert. Give it a
                        # non-empty, unique caption (SoMark image-understanding text if
                        # present, else "<type> <seq>") so each figure gets a distinct
                        # id. The tag is appended so tokenize_chunks() -> crop() can
                        # still recover the figure; remove_tag() then strips it, leaving
                        # the caption as the chunk text.
                        sections.append((f"{label}{line_tag}", ""))
                    continue

                text = self._block_text(block, internal)
                if not text:
                    continue
                line_tag = self._line_tag(tag_input)
                if typed:
                    sections.append((text, internal, line_tag))
                else:
                    sections.append((text, line_tag))
        return sections

    def _transfer_to_tables(self, pages: list[dict]) -> list:
        """Return an empty list; ``pages`` is not inspected."""
        # Tables are inlined as HTML in section text; no separate table extraction.
        return []

    # ---------------------------------------------------------------------
    # Public entry point
    # ---------------------------------------------------------------------
    def parse_pdf(
        self,
        filepath: str | PathLike[str],
        binary: BytesIO | bytes | None = None,
        callback: Optional[Callable] = None,
        parse_method: Optional[str] = None,
        **kwargs,
    ) -> tuple:
        """Parse a PDF through SoMark and return ``(sections, tables)``.

        Args:
            filepath: Path of the PDF; when ``binary`` is given only its stem
                is used to name the temporary upload file.
            binary: Optional PDF content (``bytes`` or ``BytesIO``). When
                truthy it is written to a temporary file that is removed
                before returning.
            callback: Optional ``callback(progress, message)`` progress hook.
            parse_method: Forwarded to ``_transfer_to_sections`` to select the
                section tuple shape.
            **kwargs: Accepted for interface compatibility; not used here.

        Raises:
            FileNotFoundError: no ``binary`` given and ``filepath`` is missing.
            SoMarkAPIError: the service check, submission or polling failed.
        """
        self.outlines = extract_pdf_outlines(binary if binary is not None else filepath)

        temp_dir: Optional[Path] = None
        try:
            # Normalize input to a real PDF file on disk. This happens inside
            # the try block so a failed write still cleans up the temp dir.
            if binary:
                temp_dir = Path(tempfile.mkdtemp(prefix="somark_bin_pdf_"))
                file_name = Path(filepath).stem.replace(" ", "") + ".pdf"
                pdf_path = temp_dir / file_name
                with open(pdf_path, "wb") as f:
                    f.write(binary.getvalue() if isinstance(binary, BytesIO) else binary)
            else:
                pdf_path = Path(filepath)
                if not pdf_path.exists():
                    if callback:
                        callback(-1, f"[SoMark] PDF not found: {pdf_path}")
                    raise FileNotFoundError(f"[SoMark] PDF not found: {pdf_path}")

            if callback:
                callback(0.10, f"[SoMark] using {pdf_path.name}")

            # Fail fast on a misconfigured or unreachable service before
            # spending time rasterising every page.
            ok, reason = self.check_installation()
            if not ok:
                raise SoMarkAPIError(reason)

            # Render page images locally so _line_tag/crop can map bbox to pixels.
            self.__images__(pdf_path, zoomin=1)

            task_id = self._submit_task(pdf_path, callback=callback)
            result = self._poll_task(task_id, callback=callback)

            outputs = (result or {}).get("outputs") or {}
            json_payload = outputs.get("json") or {}
            pages = json_payload.get("pages") or []
            if not pages:
                self.logger.warning("[SoMark] empty pages in response; nothing to chunk")

            if callback:
                callback(
                    0.75,
                    f"[SoMark] parsed {sum(len(p.get('blocks') or []) for p in pages)} blocks across {len(pages)} pages",
                )

            sections = self._transfer_to_sections(pages, parse_method)
            tables = self._transfer_to_tables(pages)
            return sections, tables
        finally:
            if temp_dir is not None:
                # Best-effort cleanup: a leftover temp dir must not mask the
                # real outcome of the parse.
                shutil.rmtree(temp_dir, ignore_errors=True)


if __name__ == "__main__":
    parser = SoMarkParser(
        base_url=os.environ.get("SOMARK_BASE_URL", "https://somark.tech/api/v1"),
        api_key=os.environ.get("SOMARK_API_KEY", ""),
    )
    ok, reason = parser.check_installation()
    print("SoMark available:", ok, reason)