diff --git a/Dockerfile b/Dockerfile
index b89cb266a0..fdc5f4c4bb 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -42,7 +42,8 @@ RUN --mount=type=cache,id=ragflow_apt,target=/var/cache/apt,sharing=locked \
echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache && \
chmod 1777 /tmp && \
apt update && \
- apt install -y build-essential libglib2.0-0 libglx-mesa0 libgl1 pkg-config libicu-dev libgdiplus default-jdk libatk-bridge2.0-0 libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev libjemalloc-dev gnupg unzip curl wget git vim less ghostscript pandoc texlive fonts-freefont-ttf fonts-noto-cjk postgresql-client
+ apt install -y \
+ build-essential libglib2.0-0 libglx-mesa0 libgl1 pkg-config libicu-dev libgdiplus default-jdk libatk-bridge2.0-0 libpython3-dev libgtk-4-1 libnss3 xdg-utils libgbm-dev libjemalloc-dev gnupg unzip curl wget git vim less ghostscript pandoc texlive texlive-latex-extra texlive-xetex texlive-lang-chinese fonts-freefont-ttf fonts-noto-cjk postgresql-client
# Download resource from GitHub to /usr/share/infinity
RUN mkdir -p /usr/share/infinity/resource && \
diff --git a/agent/component/docs_generator.py b/agent/component/docs_generator.py
index a3f165a552..3ab02c4cda 100644
--- a/agent/component/docs_generator.py
+++ b/agent/component/docs_generator.py
@@ -1,1570 +1,632 @@
+import logging
import json
import os
import re
-import base64
-from datetime import datetime
+import shutil
+import tempfile
from abc import ABC
-from io import BytesIO
-from typing import Optional
+from datetime import datetime
from functools import partial
-from reportlab.lib.pagesizes import A4
-from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
-from reportlab.lib.units import inch
-from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_JUSTIFY
-from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image, TableStyle, LongTable
-from reportlab.lib import colors
-from reportlab.pdfbase import pdfmetrics
-from reportlab.pdfbase.ttfonts import TTFont
-from reportlab.pdfbase.cidfonts import UnicodeCIDFont
+from io import BytesIO
+from xml.sax.saxutils import escape
from agent.component.base import ComponentParamBase
from api.utils.api_utils import timeout
+from common import settings
+from common.misc_utils import get_uuid
from .message import Message
-class PDFGeneratorParam(ComponentParamBase):
+def sanitize_filename(name: str, extension: str) -> str:
+    """Return a safe file name that ends in ``.{extension}``.
+
+    Path separators, ASCII control characters and the characters
+    ``? # % * : | < > "`` are replaced by spaces, whitespace runs are
+    collapsed to one space, and leading/trailing spaces and dots are removed.
+    Any extension already present on ``name`` is dropped (``os.path.splitext``)
+    and the remaining stem is capped at 180 characters.
+
+    Args:
+        name: Requested file name; falsy values yield the fallback name.
+        extension: Extension to append, without a leading dot.
+
+    Returns:
+        ``"<stem>.<extension>"``, or ``"file.<extension>"`` when nothing
+        usable is left of ``name``.
+    """
+    fallback = f"file.{extension}"
+    if not name:
+        return fallback
+
+    name = str(name).strip()
+    name = re.sub(r'[\\/\x00-\x1f\?\#\%\*\:\|\<\>"]', " ", name)
+    name = re.sub(r"\s+", " ", name).strip(" .")
+
+    if not name:
+        return fallback
+
+    base, _ = os.path.splitext(name)
+    # Both splitext ("a..txt" -> "a.") and the 180-char cut can leave the stem
+    # ending in a dot or space; strip both so the result never contains "..".
+    base = base[:180].rstrip(" .") or "file"
+    return f"{base}.{extension}"
+
+
+class DocGeneratorParam(ComponentParamBase):
"""
- Define the PDF Generator component parameters.
+ Define the Docs Generator component parameters.
"""
def __init__(self):
super().__init__()
- # Output format
- self.output_format = "pdf" # pdf, docx, txt
-
- # Content inputs
+ self.output_format = "pdf" # pdf, docx, txt, markdown, html
self.content = ""
- self.title = ""
- self.subtitle = ""
+ self.filename = ""
self.header_text = ""
self.footer_text = ""
-
- # Images
- self.logo_image = "" # base64 or file path
- self.logo_position = "left" # left, center, right
- self.logo_width = 2.0 # inches
- self.logo_height = 1.0 # inches
-
- # Styling
- self.font_family = "Helvetica" # Helvetica, Times-Roman, Courier
- self.font_size = 12
- self.title_font_size = 24
- self.heading1_font_size = 18
- self.heading2_font_size = 16
- self.heading3_font_size = 14
- self.text_color = "#000000"
- self.title_color = "#000000"
-
- # Page settings
- self.page_size = "A4"
- self.orientation = "portrait" # portrait, landscape
- self.margin_top = 1.0 # inches
- self.margin_bottom = 1.0
- self.margin_left = 1.0
- self.margin_right = 1.0
- self.line_spacing = 1.2
-
- # Output settings
- self.filename = ""
- self.output_directory = "/tmp/pdf_outputs"
+ self.watermark_text = ""
self.add_page_numbers = True
self.add_timestamp = True
-
- # Advanced features
- self.watermark_text = ""
- self.enable_toc = False
-
+ self.font_size = 12
self.outputs = {
- "file_path": {"value": "", "type": "string"},
- "pdf_base64": {"value": "", "type": "string"},
"download": {"value": "", "type": "string"},
- "success": {"value": False, "type": "boolean"}
}
def check(self):
- self.check_empty(self.content, "[PDFGenerator] Content")
- self.check_valid_value(self.output_format, "[PDFGenerator] Output format", ["pdf", "docx", "txt"])
- self.check_valid_value(self.logo_position, "[PDFGenerator] Logo position", ["left", "center", "right"])
- self.check_valid_value(self.font_family, "[PDFGenerator] Font family",
- ["Helvetica", "Times-Roman", "Courier", "Helvetica-Bold", "Times-Bold"])
- self.check_valid_value(self.page_size, "[PDFGenerator] Page size", ["A4", "Letter"])
- self.check_valid_value(self.orientation, "[PDFGenerator] Orientation", ["portrait", "landscape"])
- self.check_positive_number(self.font_size, "[PDFGenerator] Font size")
- self.check_positive_number(self.margin_top, "[PDFGenerator] Margin top")
+ self.check_empty(self.content, "[DocGenerator] Content")
+ self.check_valid_value(
+ self.output_format,
+ "[DocGenerator] Output format",
+ ["pdf", "docx", "txt", "markdown", "html"],
+ )
+ self.check_positive_number(self.font_size, "[DocGenerator] Font size")
+ if self.font_size < 12:
+ raise ValueError("[DocGenerator] Font size must be greater than or equal to 12")
-class PDFGenerator(Message, ABC):
- component_name = "PDFGenerator"
-
- # Track if Unicode fonts have been registered
- _unicode_fonts_registered = False
- _unicode_font_name = None
- _unicode_font_bold_name = None
-
- @classmethod
- def _reset_font_cache(cls):
- """Reset font registration cache - useful for testing"""
- cls._unicode_fonts_registered = False
- cls._unicode_font_name = None
- cls._unicode_font_bold_name = None
-
- @classmethod
- def _register_unicode_fonts(cls):
- """Register Unicode-compatible fonts for multi-language support.
-
- Uses CID fonts (STSong-Light) for reliable CJK rendering as TTF fonts
- have issues with glyph mapping in some ReportLab versions.
- """
- # If already registered successfully, return True
- if cls._unicode_fonts_registered and cls._unicode_font_name is not None:
- return True
-
- # Reset and try again if previous registration failed
- cls._unicode_fonts_registered = True
- cls._unicode_font_name = None
- cls._unicode_font_bold_name = None
-
- # Use CID fonts for reliable CJK support
- # These are built into ReportLab and work reliably across all platforms
- cid_fonts = [
- 'STSong-Light', # Simplified Chinese
- 'HeiseiMin-W3', # Japanese
- 'HYSMyeongJo-Medium', # Korean
- ]
-
- for cid_font in cid_fonts:
- try:
- pdfmetrics.registerFont(UnicodeCIDFont(cid_font))
- cls._unicode_font_name = cid_font
- cls._unicode_font_bold_name = cid_font # CID fonts don't have bold variants
- print(f"Registered CID font: {cid_font}")
- break
- except Exception as e:
- print(f"Failed to register CID font {cid_font}: {e}")
- continue
-
- # If CID fonts fail, try TTF fonts as fallback
- if not cls._unicode_font_name:
- font_paths = [
- '/usr/share/fonts/truetype/freefont/FreeSans.ttf',
- '/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf',
- ]
-
- for font_path in font_paths:
- if os.path.exists(font_path):
- try:
- pdfmetrics.registerFont(TTFont('UnicodeFont', font_path))
- cls._unicode_font_name = 'UnicodeFont'
- cls._unicode_font_bold_name = 'UnicodeFont'
- print(f"Registered TTF font from: {font_path}")
-
- # Register font family
- from reportlab.pdfbase.pdfmetrics import registerFontFamily
- registerFontFamily('UnicodeFont', normal='UnicodeFont', bold='UnicodeFont')
- break
- except Exception as e:
- print(f"Failed to register TTF font {font_path}: {e}")
- continue
-
- return cls._unicode_font_name is not None
-
- @staticmethod
- def _needs_unicode_font(text: str) -> bool:
- """Check if text contains CJK or other complex scripts that need special fonts.
-
- Standard PDF fonts (Helvetica, Times, Courier) support:
- - Basic Latin, Extended Latin, Cyrillic, Greek
-
- CID fonts are needed for:
- - CJK (Chinese, Japanese, Korean)
- - Arabic, Hebrew (RTL scripts)
- - Thai, Hindi, and other Indic scripts
- """
- if not text:
- return False
-
- for char in text:
- code = ord(char)
-
- # CJK Unified Ideographs and related ranges
- if 0x4E00 <= code <= 0x9FFF: # CJK Unified Ideographs
- return True
- if 0x3400 <= code <= 0x4DBF: # CJK Extension A
- return True
- if 0x3000 <= code <= 0x303F: # CJK Symbols and Punctuation
- return True
- if 0x3040 <= code <= 0x309F: # Hiragana
- return True
- if 0x30A0 <= code <= 0x30FF: # Katakana
- return True
- if 0xAC00 <= code <= 0xD7AF: # Hangul Syllables
- return True
- if 0x1100 <= code <= 0x11FF: # Hangul Jamo
- return True
-
- # Arabic and Hebrew (RTL scripts)
- if 0x0600 <= code <= 0x06FF: # Arabic
- return True
- if 0x0590 <= code <= 0x05FF: # Hebrew
- return True
-
- # Indic scripts
- if 0x0900 <= code <= 0x097F: # Devanagari (Hindi)
- return True
- if 0x0E00 <= code <= 0x0E7F: # Thai
- return True
-
- return False
-
- def _get_font_for_content(self, content: str) -> tuple:
- """Get appropriate font based on content, returns (regular_font, bold_font)"""
- if self._needs_unicode_font(content):
- if self._register_unicode_fonts() and self._unicode_font_name:
- return (self._unicode_font_name, self._unicode_font_bold_name or self._unicode_font_name)
- else:
- print("Warning: Content contains non-Latin characters but no Unicode font available")
-
- # Fall back to configured font
- return (self._param.font_family, self._get_bold_font_name())
-
- def _get_active_font(self) -> str:
- """Get the currently active font (Unicode or configured)"""
- return getattr(self, '_active_font', self._param.font_family)
-
- def _get_active_bold_font(self) -> str:
- """Get the currently active bold font (Unicode or configured)"""
- return getattr(self, '_active_bold_font', self._get_bold_font_name())
-
- def _get_bold_font_name(self) -> str:
- """Get the correct bold variant of the current font family"""
- font_map = {
- 'Helvetica': 'Helvetica-Bold',
- 'Times-Roman': 'Times-Bold',
- 'Courier': 'Courier-Bold',
- }
- font_family = getattr(self._param, 'font_family', 'Helvetica')
- if 'Bold' in font_family:
- return font_family
- return font_map.get(font_family, 'Helvetica-Bold')
+class DocGenerator(Message, ABC):
+ component_name = "DocGenerator"
+ _default_output_directory = os.path.join(tempfile.gettempdir(), "doc_outputs")
+ _overlay_margin = 36
+ _overlay_font_size = 9
+ _pdf_main_font = "Noto Sans CJK SC"
+ _pdf_cjk_font = "Noto Sans CJK SC"
+ _pdf_overlay_font = "STSong-Light"
def get_input_form(self) -> dict[str, dict]:
return {
"content": {
"name": "Content",
- "type": "text"
- },
- "title": {
- "name": "Title",
- "type": "line"
- },
- "subtitle": {
- "name": "Subtitle",
- "type": "line"
+ "type": "text",
}
}
- @timeout(int(os.environ.get("COMPONENT_EXEC_TIMEOUT", 10*60)))
+ @timeout(int(os.environ.get("COMPONENT_EXEC_TIMEOUT", 10 * 60)))
def _invoke(self, **kwargs):
- import traceback
-
+ file_path = None
try:
- # Get content from parameters (which may contain variable references)
- content = self._param.content or ""
- title = self._param.title or ""
- subtitle = self._param.subtitle or ""
-
- # Log PDF generation start
- print(f"Starting PDF generation for title: {title}, content length: {len(content)} chars")
-
- # Resolve variable references in content using canvas
- if content and self._canvas.is_reff(content.strip()):
- # Extract the variable reference and get its value
- import re
- matches = re.findall(self.variable_ref_patt, content, flags=re.DOTALL)
- for match in matches:
- try:
- var_value = self._canvas.get_variable_value(match)
- if var_value:
- # Handle partial (streaming) content
- if isinstance(var_value, partial):
- resolved_content = ""
- for chunk in var_value():
- resolved_content += chunk
- content = content.replace("{" + match + "}", resolved_content)
- else:
- content = content.replace("{" + match + "}", str(var_value))
- except Exception as e:
- print(f"Error resolving variable {match}: {str(e)}")
- content = content.replace("{" + match + "}", f"[ERROR: {str(e)}]")
-
- # Also process with get_kwargs for any remaining variables
- if content:
- try:
- content, _ = self.get_kwargs(content, kwargs)
- except Exception as e:
- print(f"Error processing content with get_kwargs: {str(e)}")
-
- # Process template variables in title
- if title and self._canvas.is_reff(title):
- try:
- matches = re.findall(self.variable_ref_patt, title, flags=re.DOTALL)
- for match in matches:
- var_value = self._canvas.get_variable_value(match)
- if var_value:
- title = title.replace("{" + match + "}", str(var_value))
- except Exception as e:
- print(f"Error processing title variables: {str(e)}")
-
- if title:
- try:
- title, _ = self.get_kwargs(title, kwargs)
- except Exception:
- pass
-
- # Process template variables in subtitle
- if subtitle and self._canvas.is_reff(subtitle):
- try:
- matches = re.findall(self.variable_ref_patt, subtitle, flags=re.DOTALL)
- for match in matches:
- var_value = self._canvas.get_variable_value(match)
- if var_value:
- subtitle = subtitle.replace("{" + match + "}", str(var_value))
- except Exception as e:
- print(f"Error processing subtitle variables: {str(e)}")
-
- if subtitle:
- try:
- subtitle, _ = self.get_kwargs(subtitle, kwargs)
- except Exception:
- pass
-
- # If content is still empty, check if it was passed directly
- if not content:
- content = kwargs.get("content", "")
-
- # Generate document based on format
+ content = self._resolve_content(kwargs)
+ output_format = self._param.output_format or "pdf"
+
try:
- output_format = self._param.output_format or "pdf"
-
if output_format == "pdf":
- file_path, doc_base64 = self._generate_pdf(content, title, subtitle)
+ file_path, file_bytes = self._generate_pdf(content)
mime_type = "application/pdf"
elif output_format == "docx":
- file_path, doc_base64 = self._generate_docx(content, title, subtitle)
+ file_path, file_bytes = self._generate_docx(content)
mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
elif output_format == "txt":
- file_path, doc_base64 = self._generate_txt(content, title, subtitle)
+ file_path, file_bytes = self._generate_txt(content)
mime_type = "text/plain"
+ elif output_format == "markdown":
+ file_path, file_bytes = self._generate_markdown(content)
+ mime_type = "text/markdown"
+ elif output_format == "html":
+ file_path, file_bytes = self._generate_html(content)
+ mime_type = "text/html"
else:
raise Exception(f"Unsupported output format: {output_format}")
-
+
filename = os.path.basename(file_path)
-
- # Verify the file was created and has content
- if not os.path.exists(file_path):
- raise Exception(f"Document file was not created: {file_path}")
-
- file_size = os.path.getsize(file_path)
- if file_size == 0:
- raise Exception(f"Document file is empty: {file_path}")
-
- print(f"Successfully generated {output_format.upper()}: {file_path} (Size: {file_size} bytes)")
-
- # Set outputs
- self.set_output("file_path", file_path)
- self.set_output("pdf_base64", doc_base64) # Keep same output name for compatibility
- self.set_output("success", True)
-
- # Create download info object
+ if not file_bytes:
+ raise Exception("Document file is empty")
+
+ file_size = len(file_bytes)
+ doc_id = get_uuid()
+ settings.STORAGE_IMPL.put(self._canvas.get_tenant_id(), doc_id, file_bytes)
+
+ logging.info(
+ "Successfully generated %s: %s (Size: %s bytes)",
+ output_format.upper(),
+ filename,
+ file_size,
+ )
+
download_info = {
+ "doc_id": doc_id,
"filename": filename,
- "path": file_path,
- "base64": doc_base64,
"mime_type": mime_type,
- "size": file_size
+ "size": file_size,
}
- # Output download info as JSON string so it can be used in Message block
- download_json = json.dumps(download_info)
- self.set_output("download", download_json)
-
+ self.set_output("download", json.dumps(download_info))
return download_info
-
+
except Exception as e:
- error_msg = f"Error in _generate_pdf: {str(e)}\n{traceback.format_exc()}"
- print(error_msg)
- self.set_output("success", False)
- self.set_output("_ERROR", f"PDF generation failed: {str(e)}")
+ logging.exception("Error generating %s document", output_format)
+ self.set_output("_ERROR", f"Document generation failed: {str(e)}")
raise
-
+
except Exception as e:
- error_msg = f"Error in PDFGenerator._invoke: {str(e)}\n{traceback.format_exc()}"
- print(error_msg)
- self.set_output("success", False)
- self.set_output("_ERROR", f"PDF generation failed: {str(e)}")
+ logging.exception("Error in DocGenerator._invoke")
+ self.set_output("_ERROR", f"Document generation failed: {str(e)}")
raise
-
- def _generate_pdf(self, content: str, title: str = "", subtitle: str = "") -> tuple[str, str]:
- """Generate PDF from markdown-style content with improved error handling and concurrency support"""
- import uuid
- import traceback
-
- # Create output directory if it doesn't exist
- os.makedirs(self._param.output_directory, exist_ok=True)
-
- # Initialize variables that need cleanup
- buffer = None
- temp_file_path = None
- file_path = None
-
- try:
- # Generate a unique filename to prevent conflicts
- if self._param.filename:
- base_name = os.path.splitext(self._param.filename)[0]
- filename = f"{base_name}_{uuid.uuid4().hex[:8]}.pdf"
- else:
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
- filename = f"document_{timestamp}_{uuid.uuid4().hex[:8]}.pdf"
-
- file_path = os.path.join(self._param.output_directory, filename)
- temp_file_path = f"{file_path}.tmp"
-
- # Setup page size
- page_size = A4
- if self._param.orientation == "landscape":
- page_size = (A4[1], A4[0])
-
- # Create PDF buffer and document
- buffer = BytesIO()
- doc = SimpleDocTemplate(
- buffer,
- pagesize=page_size,
- topMargin=self._param.margin_top * inch,
- bottomMargin=self._param.margin_bottom * inch,
- leftMargin=self._param.margin_left * inch,
- rightMargin=self._param.margin_right * inch
- )
-
- # Build story (content elements)
- story = []
- # Combine all text content for Unicode font detection
- all_text = f"{title} {subtitle} {content}"
-
- # IMPORTANT: Register Unicode fonts BEFORE creating any styles or Paragraphs
- # This ensures the font family is available for ReportLab's HTML parser
- if self._needs_unicode_font(all_text):
- self._register_unicode_fonts()
-
- styles = self._create_styles(all_text)
-
- # Add logo if provided
- if self._param.logo_image:
- logo = self._add_logo()
- if logo:
- story.append(logo)
- story.append(Spacer(1, 0.3 * inch))
-
- # Add title
- if title:
- title_para = Paragraph(self._escape_html(title), styles['PDFTitle'])
- story.append(title_para)
- story.append(Spacer(1, 0.2 * inch))
-
- # Add subtitle
- if subtitle:
- subtitle_para = Paragraph(self._escape_html(subtitle), styles['PDFSubtitle'])
- story.append(subtitle_para)
- story.append(Spacer(1, 0.3 * inch))
-
- # Add timestamp if enabled
- if self._param.add_timestamp:
- timestamp_text = f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
- timestamp_para = Paragraph(timestamp_text, styles['Italic'])
- story.append(timestamp_para)
- story.append(Spacer(1, 0.2 * inch))
-
- # Parse and add content
- content_elements = self._parse_markdown_content(content, styles)
- story.extend(content_elements)
-
- # Build PDF
- doc.build(story, onFirstPage=self._add_page_decorations, onLaterPages=self._add_page_decorations)
-
- # Get PDF bytes
- pdf_bytes = buffer.getvalue()
-
- # Write to temporary file first
- with open(temp_file_path, 'wb') as f:
- f.write(pdf_bytes)
-
- # Atomic rename to final filename (works across different filesystems)
- if os.path.exists(file_path):
- os.remove(file_path)
- os.rename(temp_file_path, file_path)
-
- # Verify the file was created and has content
- if not os.path.exists(file_path):
- raise Exception(f"Failed to create output file: {file_path}")
-
- file_size = os.path.getsize(file_path)
- if file_size == 0:
- raise Exception(f"Generated PDF is empty: {file_path}")
-
- # Convert to base64
- pdf_base64 = base64.b64encode(pdf_bytes).decode('utf-8')
-
- return file_path, pdf_base64
-
- except Exception as e:
- # Clean up any temporary files on error
- if temp_file_path and os.path.exists(temp_file_path):
- try:
- os.remove(temp_file_path)
- except Exception as cleanup_error:
- print(f"Error cleaning up temporary file: {cleanup_error}")
-
- error_msg = f"Error generating PDF: {str(e)}\n{traceback.format_exc()}"
- print(error_msg)
- raise Exception(f"PDF generation failed: {str(e)}")
-
finally:
- # Ensure buffer is always closed
- if buffer is not None:
+ if file_path and os.path.exists(file_path):
+ os.remove(file_path)
+
+ def _resolve_content(self, kwargs: dict) -> str:
+ content = self._param.content or ""
+ logging.info("Starting document generation, content length: %s chars", len(content))
+
+ if content and self._canvas.is_reff(content.strip()):
+ matches = re.findall(self.variable_ref_patt, content, flags=re.DOTALL)
+ for match in matches:
try:
- buffer.close()
- except Exception as close_error:
- print(f"Error closing buffer: {close_error}")
-
- def _create_styles(self, content: str = ""):
- """Create custom paragraph styles with Unicode font support if needed"""
- # Check if content contains CJK characters that need special fonts
- needs_cjk = self._needs_unicode_font(content)
-
- if needs_cjk:
- # Use CID fonts for CJK content
- if self._register_unicode_fonts() and self._unicode_font_name:
- regular_font = self._unicode_font_name
- bold_font = self._unicode_font_bold_name or self._unicode_font_name
- print(f"Using CID font for CJK content: {regular_font}")
- else:
- # Fall back to configured font if CID fonts unavailable
- regular_font = self._param.font_family
- bold_font = self._get_bold_font_name()
- print(f"Warning: CJK content detected but no CID font available, using {regular_font}")
- else:
- # Use user-selected font for Latin-only content
- regular_font = self._param.font_family
- bold_font = self._get_bold_font_name()
- print(f"Using configured font: {regular_font}")
-
- # Store active fonts as instance variables for use in other methods
- self._active_font = regular_font
- self._active_bold_font = bold_font
-
- # Get fresh style sheet
- styles = getSampleStyleSheet()
-
- # Helper function to get the correct bold font name
- def get_bold_font(font_family):
- """Get the correct bold variant of a font family"""
- # If using Unicode font, return the Unicode bold
- if font_family in ('UnicodeFont', self._unicode_font_name):
- return bold_font
- font_map = {
- 'Helvetica': 'Helvetica-Bold',
- 'Times-Roman': 'Times-Bold',
- 'Courier': 'Courier-Bold',
- }
- if 'Bold' in font_family:
- return font_family
- return font_map.get(font_family, 'Helvetica-Bold')
-
- # Use detected font instead of configured font for non-Latin content
- active_font = regular_font
- active_bold_font = bold_font
-
- # Helper function to add or update style
- def add_or_update_style(name, **kwargs):
- if name in styles:
- # Update existing style
- style = styles[name]
- for key, value in kwargs.items():
- setattr(style, key, value)
- else:
- # Add new style
- styles.add(ParagraphStyle(name=name, **kwargs))
-
- # IMPORTANT: Update base styles to use Unicode font for non-Latin content
- # This ensures ALL text uses the correct font, not just our custom styles
- add_or_update_style('Normal', fontName=active_font)
- add_or_update_style('BodyText', fontName=active_font)
- add_or_update_style('Bullet', fontName=active_font)
- add_or_update_style('Heading1', fontName=active_bold_font)
- add_or_update_style('Heading2', fontName=active_bold_font)
- add_or_update_style('Heading3', fontName=active_bold_font)
- add_or_update_style('Title', fontName=active_bold_font)
-
- # Title style
- add_or_update_style(
- 'PDFTitle',
- parent=styles['Heading1'],
- fontSize=self._param.title_font_size,
- textColor=colors.HexColor(self._param.title_color),
- fontName=active_bold_font,
- alignment=TA_CENTER,
- spaceAfter=12
- )
-
- # Subtitle style
- add_or_update_style(
- 'PDFSubtitle',
- parent=styles['Heading2'],
- fontSize=self._param.heading2_font_size,
- textColor=colors.HexColor(self._param.text_color),
- fontName=active_font,
- alignment=TA_CENTER,
- spaceAfter=12
- )
-
- # Custom heading styles
- add_or_update_style(
- 'CustomHeading1',
- parent=styles['Heading1'],
- fontSize=self._param.heading1_font_size,
- fontName=active_bold_font,
- textColor=colors.HexColor(self._param.text_color),
- spaceAfter=12,
- spaceBefore=12
- )
-
- add_or_update_style(
- 'CustomHeading2',
- parent=styles['Heading2'],
- fontSize=self._param.heading2_font_size,
- fontName=active_bold_font,
- textColor=colors.HexColor(self._param.text_color),
- spaceAfter=10,
- spaceBefore=10
- )
-
- add_or_update_style(
- 'CustomHeading3',
- parent=styles['Heading3'],
- fontSize=self._param.heading3_font_size,
- fontName=active_bold_font,
- textColor=colors.HexColor(self._param.text_color),
- spaceAfter=8,
- spaceBefore=8
- )
-
- # Body text style
- add_or_update_style(
- 'CustomBody',
- parent=styles['BodyText'],
- fontSize=self._param.font_size,
- fontName=active_font,
- textColor=colors.HexColor(self._param.text_color),
- leading=self._param.font_size * self._param.line_spacing,
- alignment=TA_JUSTIFY
- )
-
- # Bullet style
- add_or_update_style(
- 'CustomBullet',
- parent=styles['BodyText'],
- fontSize=self._param.font_size,
- fontName=active_font,
- textColor=colors.HexColor(self._param.text_color),
- leftIndent=20,
- bulletIndent=10
- )
-
- # Code style (keep Courier for code blocks)
- add_or_update_style(
- 'PDFCode',
- parent=styles.get('Code', styles['Normal']),
- fontSize=self._param.font_size - 1,
- fontName='Courier',
- textColor=colors.HexColor('#333333'),
- backColor=colors.HexColor('#f5f5f5'),
- leftIndent=20,
- rightIndent=20
- )
-
- # Italic style
- add_or_update_style(
- 'Italic',
- parent=styles['Normal'],
- fontSize=self._param.font_size,
- fontName=active_font,
- textColor=colors.HexColor(self._param.text_color)
- )
-
- return styles
-
- def _parse_markdown_content(self, content: str, styles):
- """Parse markdown-style content and convert to PDF elements"""
- elements = []
- lines = content.split('\n')
-
- i = 0
- while i < len(lines):
- line = lines[i].strip()
-
- # Skip empty lines
- if not line:
- elements.append(Spacer(1, 0.1 * inch))
- i += 1
- continue
-
- # Horizontal rule
- if line == '---' or line == '___':
- elements.append(Spacer(1, 0.1 * inch))
- elements.append(self._create_horizontal_line())
- elements.append(Spacer(1, 0.1 * inch))
- i += 1
- continue
-
- # Heading 1
- if line.startswith('# ') and not line.startswith('## '):
- text = line[2:].strip()
- elements.append(Paragraph(self._format_inline(text), styles['CustomHeading1']))
- i += 1
- continue
-
- # Heading 2
- if line.startswith('## ') and not line.startswith('### '):
- text = line[3:].strip()
- elements.append(Paragraph(self._format_inline(text), styles['CustomHeading2']))
- i += 1
- continue
-
- # Heading 3
- if line.startswith('### '):
- text = line[4:].strip()
- elements.append(Paragraph(self._format_inline(text), styles['CustomHeading3']))
- i += 1
- continue
-
- # Bullet list
- if line.startswith('- ') or line.startswith('* '):
- bullet_items = []
- while i < len(lines) and (lines[i].strip().startswith('- ') or lines[i].strip().startswith('* ')):
- item_text = lines[i].strip()[2:].strip()
- formatted = self._format_inline(item_text)
- bullet_items.append(f"• {formatted}")
- i += 1
- for item in bullet_items:
- elements.append(Paragraph(item, styles['CustomBullet']))
- continue
-
- # Numbered list
- if re.match(r'^\d+\.\s', line):
- numbered_items = []
- counter = 1
- while i < len(lines) and re.match(r'^\d+\.\s', lines[i].strip()):
- item_text = re.sub(r'^\d+\.\s', '', lines[i].strip())
- numbered_items.append(f"{counter}. {self._format_inline(item_text)}")
- counter += 1
- i += 1
- for item in numbered_items:
- elements.append(Paragraph(item, styles['CustomBullet']))
- continue
-
- # Table detection (markdown table must start with |)
- if line.startswith('|') and '|' in line:
- table_lines = []
- # Collect all consecutive lines that look like table rows
- while i < len(lines) and lines[i].strip() and '|' in lines[i]:
- table_lines.append(lines[i].strip())
- i += 1
-
- # Only process if we have at least 2 lines (header + separator or header + data)
- if len(table_lines) >= 2:
- table_elements = self._create_table(table_lines)
- if table_elements:
- # _create_table now returns a list of elements
- elements.extend(table_elements)
- elements.append(Spacer(1, 0.2 * inch))
- continue
- else:
- # Not a valid table, treat as regular text
- i -= len(table_lines) # Reset position
-
- # Code block
- if line.startswith('```'):
- code_lines = []
- i += 1
- while i < len(lines) and not lines[i].strip().startswith('```'):
- code_lines.append(lines[i])
- i += 1
- if i < len(lines):
- i += 1
- code_text = '\n'.join(code_lines)
- elements.append(Paragraph(self._escape_html(code_text), styles['PDFCode']))
- elements.append(Spacer(1, 0.1 * inch))
- continue
-
- # Regular paragraph
- paragraph_lines = [line]
- i += 1
- while i < len(lines) and lines[i].strip() and not self._is_special_line(lines[i]):
- paragraph_lines.append(lines[i].strip())
- i += 1
-
- paragraph_text = ' '.join(paragraph_lines)
- formatted_text = self._format_inline(paragraph_text)
- elements.append(Paragraph(formatted_text, styles['CustomBody']))
- elements.append(Spacer(1, 0.1 * inch))
-
- return elements
-
- def _is_special_line(self, line: str) -> bool:
- """Check if line is a special markdown element"""
- line = line.strip()
- return (line.startswith('#') or
- line.startswith('- ') or
- line.startswith('* ') or
- re.match(r'^\d+\.\s', line) or
- line in ['---', '___'] or
- line.startswith('```') or
- '|' in line)
-
- def _format_inline(self, text: str) -> str:
- """Format inline markdown (bold, italic, code)"""
- # First, escape the existing HTML to not conflict with our tags.
- text = self._escape_html(text)
-
- # IMPORTANT: Process inline code FIRST to protect underscores inside code blocks
- # Use a placeholder to protect code blocks from italic/bold processing
- code_blocks = []
- def save_code(match):
- code_blocks.append(match.group(1))
- return f"__CODE_BLOCK_{len(code_blocks)-1}__"
-
- text = re.sub(r'`(.+?)`', save_code, text)
-
- # Then, apply markdown formatting.
- # The order is important: from most specific to least specific.
-
- # Bold and italic combined: ***text*** or ___text___
- text = re.sub(r'\*\*\*(.+?)\*\*\*', r'\1', text)
- text = re.sub(r'___(.+?)___', r'\1', text)
-
- # Bold: **text** or __text__
- text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
- text = re.sub(r'__([^_]+?)__', r'\1', text) # More restrictive to avoid matching placeholders
-
- # Italic: *text* or _text_ (but not underscores in words like variable_name)
- text = re.sub(r'\*([^*]+?)\*', r'\1', text)
- # Only match _text_ when surrounded by spaces or at start/end, not mid-word underscores
- text = re.sub(r'(?\1', text)
-
- # Restore code blocks with proper formatting
- for i, code in enumerate(code_blocks):
- text = text.replace(f"__CODE_BLOCK_{i}__", f'{code}')
-
- return text
-
- def _escape_html(self, text: str) -> str:
- """Escape HTML special characters and clean up markdown.
-
- Args:
- text: Input text that may contain HTML or markdown
-
- Returns:
- str: Cleaned and escaped text
- """
- if not text:
- return ""
-
- # Ensure we're working with a string
- text = str(text)
-
- # Remove HTML form elements and tags
- text = re.sub(r']*>', '', text, flags=re.IGNORECASE) # Remove input tags
- text = re.sub(r'', '', text, flags=re.IGNORECASE | re.DOTALL) # Remove textarea
- text = re.sub(r'', '', text, flags=re.IGNORECASE | re.DOTALL) # Remove select
- text = re.sub(r'', '', text, flags=re.IGNORECASE | re.DOTALL) # Remove buttons
- text = re.sub(r'
', '', text, flags=re.IGNORECASE | re.DOTALL) # Remove forms
-
- # Remove other common HTML tags (but preserve content)
- text = re.sub(r']*>', '', text, flags=re.IGNORECASE)
- text = re.sub(r'
', '', text, flags=re.IGNORECASE)
- text = re.sub(r']*>', '', text, flags=re.IGNORECASE)
- text = re.sub(r'', '', text, flags=re.IGNORECASE)
- text = re.sub(r']*>', '', text, flags=re.IGNORECASE)
- text = re.sub(r'
', '\n', text, flags=re.IGNORECASE)
-
- # First, handle common markdown table artifacts
- text = re.sub(r'^[|\-\s:]+$', '', text, flags=re.MULTILINE) # Remove separator lines
- text = re.sub(r'^\s*\|\s*|\s*\|\s*$', '', text) # Remove leading/trailing pipes
- text = re.sub(r'\s*\|\s*', ' | ', text) # Normalize pipes
-
- # Remove markdown links, but keep other formatting characters for _format_inline
- text = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text) # Remove markdown links
-
- # Escape HTML special characters
- text = text.replace('&', '&')
- text = text.replace('<', '<')
- text = text.replace('>', '>')
-
- # Clean up excessive whitespace
- text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text) # Multiple blank lines to double
- text = re.sub(r' +', ' ', text) # Multiple spaces to single
-
- return text.strip()
-
- def _get_cell_style(self, row_idx: int, is_header: bool = False, font_size: int = None) -> 'ParagraphStyle':
- """Get the appropriate style for a table cell."""
- styles = getSampleStyleSheet()
-
- # Helper function to get the correct bold font name
- def get_bold_font(font_family):
- font_map = {
- 'Helvetica': 'Helvetica-Bold',
- 'Times-Roman': 'Times-Bold',
- 'Courier': 'Courier-Bold',
- }
- if 'Bold' in font_family:
- return font_family
- return font_map.get(font_family, 'Helvetica-Bold')
-
- if is_header:
- return ParagraphStyle(
- 'TableHeader',
- parent=styles['Normal'],
- fontSize=self._param.font_size,
- fontName=self._get_active_bold_font(),
- textColor=colors.whitesmoke,
- alignment=TA_CENTER,
- leading=self._param.font_size * 1.2,
- wordWrap='CJK'
- )
- else:
- font_size = font_size or (self._param.font_size - 1)
- return ParagraphStyle(
- 'TableCell',
- parent=styles['Normal'],
- fontSize=font_size,
- fontName=self._get_active_font(),
- textColor=colors.black,
- alignment=TA_LEFT,
- leading=font_size * 1.15,
- wordWrap='CJK'
- )
-
- def _convert_table_to_definition_list(self, data: list[list[str]]) -> list:
- """Convert a table to a definition list format for better handling of large content.
-
- This method handles both simple and complex tables, including those with nested content.
- It ensures that large cell content is properly wrapped and paginated.
- """
- elements = []
- styles = getSampleStyleSheet()
-
- # Base styles
- base_font_size = getattr(self._param, 'font_size', 10)
-
- # Body style
- body_style = ParagraphStyle(
- 'TableBody',
- parent=styles['Normal'],
- fontSize=base_font_size,
- fontName=self._get_active_font(),
- textColor=colors.HexColor(getattr(self._param, 'text_color', '#000000')),
- spaceAfter=6,
- leading=base_font_size * 1.2
- )
-
- # Label style (for field names)
- label_style = ParagraphStyle(
- 'LabelStyle',
- parent=body_style,
- fontName=self._get_active_bold_font(),
- textColor=colors.HexColor('#2c3e50'),
- fontSize=base_font_size,
- spaceAfter=4,
- leftIndent=0,
- leading=base_font_size * 1.3
- )
-
- # Value style (for cell content) - clean, no borders
- value_style = ParagraphStyle(
- 'ValueStyle',
- parent=body_style,
- leftIndent=15,
- rightIndent=0,
- spaceAfter=8,
- spaceBefore=2,
- fontSize=base_font_size,
- textColor=colors.HexColor('#333333'),
- alignment=TA_JUSTIFY,
- leading=base_font_size * 1.4,
- # No borders or background - clean text only
- )
-
- try:
- # If we have no data, return empty list
- if not data or not any(data):
- return elements
-
- # Get column headers or generate them
- headers = []
- if data and len(data) > 0:
- headers = [str(h).strip() for h in data[0]]
-
- # If no headers or empty headers, generate them
- if not any(headers):
- headers = [f"Column {i+1}" for i in range(len(data[0]) if data and len(data) > 0 else 0)]
-
- # Process each data row (skip header if it exists)
- start_row = 1 if len(data) > 1 and any(data[0]) else 0
-
- for row_idx in range(start_row, len(data)):
- row = data[row_idx] if row_idx < len(data) else []
- if not row:
- continue
-
- # Create a container for the row
- row_elements = []
-
- # Process each cell in the row
- for col_idx in range(len(headers)):
- if col_idx >= len(headers):
+ var_value = self._canvas.get_variable_value(match)
+ if var_value is None:
continue
-
- # Get cell content
- cell_text = str(row[col_idx]).strip() if col_idx < len(row) and row[col_idx] is not None else ""
-
- # Skip empty cells
- if not cell_text or cell_text.isspace():
- continue
-
- # Clean up markdown artifacts for regular text content
- cell_text = str(cell_text) # Ensure it's a string
-
- # Remove markdown table formatting
- cell_text = re.sub(r'^[|\-\s:]+$', '', cell_text, flags=re.MULTILINE) # Remove separator lines
- cell_text = re.sub(r'^\s*\|\s*|\s*\|\s*$', '', cell_text) # Remove leading/trailing pipes
- cell_text = re.sub(r'\s*\|\s*', ' | ', cell_text) # Normalize pipes
- cell_text = re.sub(r'\s+', ' ', cell_text).strip() # Normalize whitespace
-
- # Remove any remaining markdown formatting
- cell_text = re.sub(r'`(.*?)`', r'\1', cell_text) # Remove code ticks
- cell_text = re.sub(r'\*\*(.*?)\*\*', r'\1', cell_text) # Remove bold
- cell_text = re.sub(r'\*(.*?)\*', r'\1', cell_text) # Remove italic
-
- # Clean up any HTML entities or special characters
- cell_text = self._escape_html(cell_text)
-
- # If content still looks like a table, convert it to plain text
- if '|' in cell_text and ('--' in cell_text or any(cell_text.count('|') > 2 for line in cell_text.split('\n') if line.strip())):
- # Convert to a simple text format
- lines = [line.strip() for line in cell_text.split('\n') if line.strip()]
- cell_text = ' | '.join(lines[:5]) # Join first 5 lines with pipe
- if len(lines) > 5:
- cell_text += '...'
-
- # Process long content with better wrapping
- max_chars_per_line = 100 # Reduced for better readability
- max_paragraphs = 3 # Maximum number of paragraphs to show initially
-
- # Split into paragraphs
- paragraphs = [p for p in cell_text.split('\n\n') if p.strip()]
-
- # If content is too long, truncate with "show more" indicator
- if len(paragraphs) > max_paragraphs or any(len(p) > max_chars_per_line * 3 for p in paragraphs):
- wrapped_paragraphs = []
-
- for i, para in enumerate(paragraphs[:max_paragraphs]):
- if len(para) > max_chars_per_line * 3:
- # Split long paragraphs
- words = para.split()
- current_line = []
- current_length = 0
-
- for word in words:
- if current_line and current_length + len(word) + 1 > max_chars_per_line:
- wrapped_paragraphs.append(' '.join(current_line))
- current_line = [word]
- current_length = len(word)
- else:
- current_line.append(word)
- current_length += len(word) + (1 if current_line else 0)
-
- if current_line:
- wrapped_paragraphs.append(' '.join(current_line))
- else:
- wrapped_paragraphs.append(para)
-
- # Add "show more" indicator if there are more paragraphs
- if len(paragraphs) > max_paragraphs:
- wrapped_paragraphs.append(f"... and {len(paragraphs) - max_paragraphs} more paragraphs")
-
- cell_text = '\n\n'.join(wrapped_paragraphs)
-
- # Add label and content with clean formatting (no borders)
- label_para = Paragraph(f"{self._escape_html(headers[col_idx])}:", label_style)
- value_para = Paragraph(self._escape_html(cell_text), value_style)
-
- # Add elements with proper spacing
- row_elements.append(label_para)
- row_elements.append(Spacer(1, 0.03 * 72)) # Tiny space between label and value
- row_elements.append(value_para)
-
- # Add spacing between rows
- if row_elements and row_idx < len(data) - 1:
- # Add a subtle horizontal line as separator
- row_elements.append(Spacer(1, 0.1 * 72))
- row_elements.append(self._create_horizontal_line(width=0.5, color='#e0e0e0'))
- row_elements.append(Spacer(1, 0.15 * 72))
-
- elements.extend(row_elements)
-
- # Add some space after the table
- if elements:
- elements.append(Spacer(1, 0.3 * 72)) # 0.3 inches in points
-
- except Exception as e:
- # Fallback to simple text representation if something goes wrong
- error_style = ParagraphStyle(
- 'ErrorStyle',
- parent=styles['Normal'],
- fontSize=base_font_size - 1,
- textColor=colors.red,
- backColor=colors.HexColor('#fff0f0'),
- borderWidth=1,
- borderColor=colors.red,
- borderPadding=5
- )
-
- error_msg = [
- Paragraph("Error processing table:", error_style),
- Paragraph(str(e), error_style),
- Spacer(1, 0.2 * 72)
- ]
-
- # Add a simplified version of the table
- try:
- for row in data[:10]: # Limit to first 10 rows to avoid huge error output
- error_msg.append(Paragraph(" | ".join(str(cell) for cell in row), body_style))
- if len(data) > 10:
- error_msg.append(Paragraph(f"... and {len(data) - 10} more rows", body_style))
- except Exception:
- pass
-
- elements.extend(error_msg)
-
- return elements
-
- def _create_table(self, table_lines: list[str]) -> Optional[list]:
- """Create a table from markdown table syntax with robust error handling.
-
- This method handles simple tables and falls back to a list format for complex cases.
-
- Returns:
- A list of flowables (could be a table or alternative representation)
- Returns None if the table cannot be created.
- """
- if not table_lines or len(table_lines) < 2:
- return None
-
- try:
- # Parse table data
- data = []
- max_columns = 0
-
- for line in table_lines:
- # Skip separator lines (e.g., |---|---|)
- if re.match(r'^\|[\s\-:]+\|$', line):
- continue
-
- # Handle empty lines within tables
- if not line.strip():
- continue
-
- # Split by | and clean up cells
- cells = []
- in_quotes = False
- current_cell = ""
-
- # Custom split to handle escaped pipes and quoted content
- for char in line[1:]: # Skip initial |
- if char == '|' and not in_quotes:
- cells.append(current_cell.strip())
- current_cell = ""
- elif char == '"':
- in_quotes = not in_quotes
- current_cell += char
- elif char == '\\' and not in_quotes:
- # Handle escaped characters
- pass
+ if isinstance(var_value, partial):
+ resolved_content = ""
+ for chunk in var_value():
+ resolved_content += chunk
+ content = content.replace("{" + match + "}", resolved_content)
else:
- current_cell += char
-
- # Add the last cell
- if current_cell.strip() or len(cells) > 0:
- cells.append(current_cell.strip())
-
- # Remove empty first/last elements if they're empty (from leading/trailing |)
- if cells and not cells[0]:
- cells = cells[1:]
- if cells and not cells[-1]:
- cells = cells[:-1]
-
- if cells:
- data.append(cells)
- max_columns = max(max_columns, len(cells))
-
- if not data or max_columns == 0:
- return None
-
- # Ensure all rows have the same number of columns
- for row in data:
- while len(row) < max_columns:
- row.append('')
-
- # Calculate available width for table
- from reportlab.lib.pagesizes import A4
- page_width = A4[0] if self._param.orientation == 'portrait' else A4[1]
- available_width = page_width - (self._param.margin_left + self._param.margin_right) * inch
-
- # Check if we should use definition list format
- max_cell_length = max((len(str(cell)) for row in data for cell in row), default=0)
- total_rows = len(data)
-
- # Use definition list format if:
- # - Any cell is too large (> 300 chars), OR
- # - More than 6 columns, OR
- # - More than 20 rows, OR
- # - Contains nested tables or complex structures
- has_nested_tables = any('|' in cell and '---' in cell for row in data for cell in row)
- has_complex_cells = any(len(str(cell)) > 150 for row in data for cell in row)
-
- should_use_list_format = (
- max_cell_length > 300 or
- max_columns > 6 or
- total_rows > 20 or
- has_nested_tables or
- has_complex_cells
- )
-
- if should_use_list_format:
- return self._convert_table_to_definition_list(data)
-
- # Process cells for normal table
- processed_data = []
- for row_idx, row in enumerate(data):
- processed_row = []
- for cell_idx, cell in enumerate(row):
- cell_text = str(cell).strip() if cell is not None else ""
-
- # Handle empty cells
- if not cell_text:
- processed_row.append("")
- continue
-
- # Clean up markdown table artifacts
- cell_text = re.sub(r'\\\|', '|', cell_text) # Unescape pipes
- cell_text = re.sub(r'\\n', '\n', cell_text) # Handle explicit newlines
-
- # Check for nested tables
- if '|' in cell_text and '---' in cell_text:
- # This cell contains a nested table
- nested_lines = [line.strip() for line in cell_text.split('\n') if line.strip()]
- nested_table = self._create_table(nested_lines)
- if nested_table:
- processed_row.append(nested_table[0]) # Add the nested table
- continue
-
- # Process as regular text
- font_size = self._param.font_size - 1 if row_idx > 0 else self._param.font_size
- try:
- style = self._get_cell_style(row_idx, is_header=(row_idx == 0), font_size=font_size)
- escaped_text = self._escape_html(cell_text)
- processed_row.append(Paragraph(escaped_text, style))
- except Exception:
- processed_row.append(self._escape_html(cell_text))
-
- processed_data.append(processed_row)
-
- # Calculate column widths
- min_col_width = 0.5 * inch
- max_cols = int(available_width / min_col_width)
-
- if max_columns > max_cols:
- return self._convert_table_to_definition_list(data)
-
- col_width = max(min_col_width, available_width / max_columns)
- col_widths = [col_width] * max_columns
-
- # Create the table
- try:
- table = LongTable(processed_data, colWidths=col_widths, repeatRows=1)
-
- # Define table style
- table_style = [
- ('BACKGROUND', (0, 0), (-1, 0), colors.HexColor('#2c3e50')), # Darker header
- ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
- ('ALIGN', (0, 0), (-1, 0), 'CENTER'),
- ('FONTNAME', (0, 0), (-1, 0), self._get_active_bold_font()),
- ('FONTSIZE', (0, 0), (-1, -1), self._param.font_size - 1),
- ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
- ('BACKGROUND', (0, 1), (-1, -1), colors.HexColor('#f8f9fa')), # Lighter background
- ('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor('#dee2e6')), # Lighter grid
- ('VALIGN', (0, 0), (-1, -1), 'TOP'),
- ('TOPPADDING', (0, 0), (-1, -1), 8),
- ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
- ('LEFTPADDING', (0, 0), (-1, -1), 8),
- ('RIGHTPADDING', (0, 0), (-1, -1), 8),
- ]
-
- # Add zebra striping for better readability
- for i in range(1, len(processed_data)):
- if i % 2 == 0:
- table_style.append(('BACKGROUND', (0, i), (-1, i), colors.HexColor('#f1f3f5')))
-
- table.setStyle(TableStyle(table_style))
-
- # Add a small spacer after the table
- return [table, Spacer(1, 0.2 * inch)]
-
- except Exception as table_error:
- print(f"Error creating table: {table_error}")
- return self._convert_table_to_definition_list(data)
-
- except Exception as e:
- print(f"Error processing table: {e}")
- # Return a simple text representation of the table
- try:
- text_content = []
- for row in data:
- text_content.append(" | ".join(str(cell) for cell in row))
- return [Paragraph("
".join(text_content), self._get_cell_style(0))]
- except Exception:
- return None
+ content = content.replace("{" + match + "}", str(var_value))
+ except Exception as e:
+ logging.warning("Error resolving variable %s: %s", match, str(e))
+ content = content.replace("{" + match + "}", f"[ERROR: {str(e)}]")
- def _create_horizontal_line(self, width: float = 1, color: str = None):
- """Create a horizontal line with customizable width and color
-
- Args:
- width: Line thickness in points (default: 1)
- color: Hex color string (default: grey)
-
- Returns:
- HRFlowable: Horizontal line element
- """
- from reportlab.platypus import HRFlowable
- line_color = colors.HexColor(color) if color else colors.grey
- return HRFlowable(width="100%", thickness=width, color=line_color, spaceBefore=0, spaceAfter=0)
+ if content:
+ try:
+ content, _ = self.get_kwargs(content, kwargs)
+ except Exception as e:
+ logging.warning("Error processing content with get_kwargs: %s", str(e))
+
+ if not content:
+ content = kwargs.get("content", "")
+
+ return content
+
+    def _get_output_directory(self) -> str:
+        """Return the default output directory, creating it when it does not exist."""
+        directory = self._default_output_directory
+        os.makedirs(directory, exist_ok=True)
+        return directory
+
+    def _build_output_filename(self, output_format: str) -> str:
+        """Return the file name to use for a document in ``output_format``.
+
+        A configured ``filename`` parameter is passed to ``sanitize_filename``
+        together with the lower-cased format. Otherwise a name of the form
+        ``document_<timestamp>_<8 hex chars>.<output_format>`` is generated.
+        """
+        import uuid
+
+        configured_name = self._param.filename
+        if configured_name:
+            return sanitize_filename(configured_name, output_format.lower())
+
+        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        unique_suffix = uuid.uuid4().hex[:8]
+        return f"document_{stamp}_{unique_suffix}.{output_format}"
+
+    def _get_timestamp_text(self) -> str:
+        """Return ``Generated: <YYYY-MM-DD HH:MM:SS>`` for the current local time."""
+        stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+        return "Generated: " + stamp
+
+    def _write_bytes_output(self, content: bytes, extension: str) -> tuple[str, bytes]:
+        """Write ``content`` to a new file in the output directory.
+
+        Returns:
+            The path of the written file and the bytes that were written.
+        """
+        target_path = os.path.join(
+            self._get_output_directory(),
+            self._build_output_filename(extension),
+        )
+        with open(target_path, "wb") as output_file:
+            output_file.write(content)
+        return target_path, content
+
+    def _build_markdown_source(self, content: str, include_timestamp_in_body: bool = False) -> str:
+        """Return the markdown to convert, optionally prefixed with the timestamp line.
+
+        The timestamp is prepended, followed by a blank line, only when both
+        ``include_timestamp_in_body`` and the ``add_timestamp`` parameter are set.
+        """
+        if include_timestamp_in_body and self._param.add_timestamp:
+            return f"{self._get_timestamp_text()}\n\n{content}"
+        return content
+
+    def _get_heading_sizes(self) -> tuple[int, int, int]:
+        """Return the level 1-3 heading sizes: the base font size plus 6, 4 and 2."""
+        body_size = int(self._param.font_size)
+        h1_size = body_size + 6
+        h2_size = body_size + 4
+        h3_size = body_size + 2
+        return h1_size, h2_size, h3_size
+
+    def _generate_pandoc_binary_output(
+        self,
+        content: str,
+        target_format: str,
+        extension: str,
+        include_timestamp_in_body: bool = False,
+        extra_args: list[str] | None = None,
+    ) -> tuple[str, bytes]:
+        """Convert markdown ``content`` with pandoc into a file in the output directory.
+
+        Args:
+            content: Markdown source text.
+            target_format: Pandoc output format passed as ``to``.
+            extension: Extension used when building the output file name.
+            include_timestamp_in_body: Forwarded to ``_build_markdown_source``.
+            extra_args: Additional pandoc arguments; ``None`` means no extras.
+
+        Returns:
+            The path of the generated file and its bytes.
+        """
+        import pypandoc
+
+        destination = os.path.join(
+            self._get_output_directory(),
+            self._build_output_filename(extension),
+        )
+        source_text = self._build_markdown_source(
+            content,
+            include_timestamp_in_body=include_timestamp_in_body,
+        )
+        pandoc_args = extra_args if extra_args else []
+
+        pypandoc.convert_text(
+            source_text,
+            to=target_format,
+            format="markdown",
+            outputfile=destination,
+            extra_args=pandoc_args,
+        )
+
+        with open(destination, "rb") as generated:
+            return destination, generated.read()
+
+    def _generate_pandoc_text_output(
+        self,
+        content: str,
+        target_format: str,
+        extension: str,
+        include_timestamp_in_body: bool = True,
+    ) -> tuple[str, bytes]:
+        """Convert markdown ``content`` to a text format and write it as UTF-8.
+
+        Returns:
+            The path and bytes returned by ``_write_bytes_output``.
+        """
+        import pypandoc
+
+        source_text = self._build_markdown_source(
+            content,
+            include_timestamp_in_body=include_timestamp_in_body,
+        )
+        rendered = pypandoc.convert_text(
+            source_text,
+            to=target_format,
+            format="markdown",
+        )
+        payload = rendered.encode("utf-8")
+        return self._write_bytes_output(payload, extension)
+
+    def _select_pdf_engine(self) -> str:
+        """Return ``"xelatex"`` when it is on the PATH, otherwise raise.
+
+        Raises:
+            Exception: If no ``xelatex`` executable is found.
+        """
+        if not shutil.which("xelatex"):
+            raise Exception("No PDF engine found. Install xelatex.")
+        return "xelatex"
+
+    def _get_pdf_font_args(self) -> list[str]:
+        """Return the pandoc ``-V`` arguments that set ``mainfont`` and ``CJKmainfont``."""
+        font_variables = (
+            ("mainfont", self._pdf_main_font),
+            ("CJKmainfont", self._pdf_cjk_font),
+        )
+        args: list[str] = []
+        for variable, font_name in font_variables:
+            args.extend(("-V", f"{variable}={font_name}"))
+        return args
+
+ def _get_pdf_overlay_font_name(self) -> str:
+ from reportlab.pdfbase import pdfmetrics
+ from reportlab.pdfbase.cidfonts import UnicodeCIDFont
- def _add_logo(self) -> Optional[Image]:
- """Add logo image to PDF"""
try:
- # Check if it's base64 or file path
- if self._param.logo_image.startswith('data:image'):
- # Extract base64 data
- base64_data = self._param.logo_image.split(',')[1]
- image_data = base64.b64decode(base64_data)
- img = Image(BytesIO(image_data))
- elif os.path.exists(self._param.logo_image):
- img = Image(self._param.logo_image)
- else:
- return None
-
- # Set size
- img.drawWidth = self._param.logo_width * inch
- img.drawHeight = self._param.logo_height * inch
-
- # Set alignment
- if self._param.logo_position == 'center':
- img.hAlign = 'CENTER'
- elif self._param.logo_position == 'right':
- img.hAlign = 'RIGHT'
- else:
- img.hAlign = 'LEFT'
-
- return img
- except Exception as e:
- print(f"Error adding logo: {e}")
+ pdfmetrics.getFont(self._pdf_overlay_font)
+ except KeyError:
+ pdfmetrics.registerFont(UnicodeCIDFont(self._pdf_overlay_font))
+
+ return self._pdf_overlay_font
+
+ def _build_pdf_heading_overrides(self) -> str:
+ """Build the LaTeX snippet that sets the body and heading font sizes.
+
+ The snippet redefines the normalsize command from the ``font_size``
+ parameter and redefines the section, subsection and subsubsection
+ commands with the sizes returned by ``_get_heading_sizes``.
+
+ Returns:
+ str: The LaTeX source with surrounding whitespace stripped.
+ """
+ font_size = int(self._param.font_size)
+ # Every leading is 1.2 times its font size, rounded to one decimal place.
+ leading = round(font_size * 1.2, 1)
+ h1_size, h2_size, h3_size = self._get_heading_sizes()
+ h1_leading = round(h1_size * 1.2, 1)
+ h2_leading = round(h2_size * 1.2, 1)
+ h3_leading = round(h3_size * 1.2, 1)
+
+ # Raw f-string: doubled braces emit literal LaTeX braces, single braces
+ # are substitution fields.
+ return rf"""
+\makeatletter
+\renewcommand\normalsize{{
+ \@setfontsize\normalsize{{{font_size}pt}}{{{leading}pt}}
+ \abovedisplayskip 12pt plus 3pt minus 7pt
+ \abovedisplayshortskip \z@ plus 3pt
+ \belowdisplayshortskip 6.5pt plus 3.5pt minus 3pt
+ \belowdisplayskip \abovedisplayskip
+ \let\@listi\@listI
+}}
+\normalsize
+\renewcommand\section{{\@startsection{{section}}{{1}}{{\z@}}{{-3.5ex \@plus -1ex \@minus -.2ex}}{{2.3ex \@plus .2ex}}{{\normalfont\fontsize{{{h1_size}pt}}{{{h1_leading}pt}}\selectfont\bfseries}}}}
+\renewcommand\subsection{{\@startsection{{subsection}}{{2}}{{\z@}}{{-3.25ex\@plus -1ex \@minus -.2ex}}{{1.5ex \@plus .2ex}}{{\normalfont\fontsize{{{h2_size}pt}}{{{h2_leading}pt}}\selectfont\bfseries}}}}
+\renewcommand\subsubsection{{\@startsection{{subsubsection}}{{3}}{{\z@}}{{-3.25ex\@plus -1ex \@minus -.2ex}}{{1.5ex \@plus .2ex}}{{\normalfont\fontsize{{{h3_size}pt}}{{{h3_leading}pt}}\selectfont\bfseries}}}}
+\makeatother
+""".strip()
+
+    def _write_temp_tex(self, content: str) -> str:
+        """Write ``content`` to a new ``.tex`` file in the output directory and return its path.
+
+        The file is created with ``delete=False``, so it remains on disk after
+        this method returns.
+        """
+        target_directory = self._get_output_directory()
+        tex_file = tempfile.NamedTemporaryFile(
+            mode="w",
+            encoding="utf-8",
+            suffix=".tex",
+            dir=target_directory,
+            delete=False,
+        )
+        with tex_file:
+            tex_file.write(content)
+        return tex_file.name
+
+    def _should_apply_pdf_overlay(self) -> bool:
+        """Return True when any header, footer, watermark, page-number or timestamp option is set."""
+        params = self._param
+        overlay_settings = (
+            params.header_text,
+            params.footer_text,
+            params.watermark_text,
+            params.add_page_numbers,
+            params.add_timestamp,
+        )
+        return any(overlay_settings)
+
+ def _build_pdf_overlay_page(self, width: float, height: float, page_number: int):
+ if not self._should_apply_pdf_overlay():
return None
- def _add_page_decorations(self, canvas, doc):
- """Add header, footer, page numbers, watermark"""
- canvas.saveState()
-
- # Get active font for decorations
- active_font = self._get_active_font()
-
- # Add watermark
+ from pypdf import PdfReader
+ from reportlab.lib.colors import Color
+ from reportlab.pdfgen import canvas as pdf_canvas
+
+ buffer = BytesIO()
+ overlay = pdf_canvas.Canvas(buffer, pagesize=(width, height))
+ overlay_font = self._get_pdf_overlay_font_name()
+
if self._param.watermark_text:
- canvas.setFont(active_font, 60)
- canvas.setFillColorRGB(0.9, 0.9, 0.9, alpha=0.3)
- canvas.saveState()
- canvas.translate(doc.pagesize[0] / 2, doc.pagesize[1] / 2)
- canvas.rotate(45)
- canvas.drawCentredString(0, 0, self._param.watermark_text)
- canvas.restoreState()
-
- # Add header
+ overlay.saveState()
+ if hasattr(overlay, "setFillAlpha"):
+ overlay.setFillAlpha(0.15)
+ overlay.setFillColor(Color(0.6, 0.6, 0.6))
+ overlay.setFont(overlay_font, 48)
+ overlay.translate(width / 2, height / 2)
+ overlay.rotate(45)
+ overlay.drawCentredString(0, 0, self._param.watermark_text)
+ overlay.restoreState()
+
+ overlay.setFont(overlay_font, self._overlay_font_size)
+ overlay.setFillColor(Color(0.35, 0.35, 0.35))
+
if self._param.header_text:
- canvas.setFont(active_font, 9)
- canvas.setFillColorRGB(0.5, 0.5, 0.5)
- canvas.drawString(doc.leftMargin, doc.pagesize[1] - 0.5 * inch, self._param.header_text)
-
- # Add footer
+ overlay.drawString(
+ self._overlay_margin,
+ height - self._overlay_margin + 8,
+ self._param.header_text,
+ )
+
if self._param.footer_text:
- canvas.setFont(active_font, 9)
- canvas.setFillColorRGB(0.5, 0.5, 0.5)
- canvas.drawString(doc.leftMargin, 0.5 * inch, self._param.footer_text)
-
- # Add page numbers
+ overlay.drawString(
+ self._overlay_margin,
+ self._overlay_margin - 8,
+ self._param.footer_text,
+ )
+
+ if self._param.add_timestamp:
+ overlay.drawCentredString(
+ width / 2,
+ self._overlay_margin - 8,
+ self._get_timestamp_text(),
+ )
+
if self._param.add_page_numbers:
- page_num = canvas.getPageNumber()
- text = f"Page {page_num}"
- canvas.setFont(active_font, 9)
- canvas.setFillColorRGB(0.5, 0.5, 0.5)
- canvas.drawRightString(doc.pagesize[0] - doc.rightMargin, 0.5 * inch, text)
-
- canvas.restoreState()
+ overlay.drawRightString(
+ width - self._overlay_margin,
+ self._overlay_margin - 8,
+ f"Page {page_number}",
+ )
+
+ overlay.save()
+ buffer.seek(0)
+ return PdfReader(buffer).pages[0]
+
+    def _apply_pdf_overlay(self, file_path: str) -> tuple[str, bytes]:
+        """Merge the per-page overlay into the PDF at ``file_path`` and return its bytes.
+
+        When no overlay option is set the file is left untouched. Otherwise the
+        merged document is written to ``<file_path>.overlay`` and then moved over
+        the original file.
+
+        Returns:
+            ``file_path`` and the bytes of the file after processing.
+        """
+        from pypdf import PdfReader, PdfWriter
+
+        if self._should_apply_pdf_overlay():
+            source_pdf = PdfReader(file_path)
+            merged_pdf = PdfWriter()
+
+            for number, pdf_page in enumerate(source_pdf.pages, start=1):
+                stamp = self._build_pdf_overlay_page(
+                    float(pdf_page.mediabox.width),
+                    float(pdf_page.mediabox.height),
+                    number,
+                )
+                if stamp is not None:
+                    pdf_page.merge_page(stamp)
+                merged_pdf.add_page(pdf_page)
+
+            staging_path = f"{file_path}.overlay"
+            with open(staging_path, "wb") as staged:
+                merged_pdf.write(staged)
+
+            os.replace(staging_path, file_path)
+
+        with open(file_path, "rb") as result:
+            return file_path, result.read()
+
+    def _clear_docx_container(self, container):
+        """Remove every child element from ``container``'s underlying XML element."""
+        root = container._element
+        # Iterate over a snapshot because children are removed during the loop.
+        for node in tuple(root):
+            root.remove(node)
+
+    def _append_docx_field(self, run, instruction: str):
+        """Append a Word field with the given instruction text to ``run``.
+
+        Three elements are appended to the run in order: a ``w:fldChar`` marked
+        ``begin``, a ``w:instrText`` holding ``instruction`` and a ``w:fldChar``
+        marked ``end``.
+
+        Args:
+            run: python-docx run whose ``_r`` element receives the field.
+            instruction: Field instruction text, e.g. ``" PAGE "``.
+        """
+        from docx.oxml import OxmlElement
+        from docx.oxml.ns import qn
+
+        # qn() expands the prefixed name to the same Clark-notation key the
+        # literal spelled out, without depending on the run's namespace map.
+        field_char_type = qn("w:fldCharType")
+
+        begin = OxmlElement("w:fldChar")
+        begin.set(field_char_type, "begin")
+
+        instr = OxmlElement("w:instrText")
+        # Keep the leading and trailing spaces of the instruction text.
+        instr.set(qn("xml:space"), "preserve")
+        instr.text = instruction
+
+        end = OxmlElement("w:fldChar")
+        end.set(field_char_type, "end")
+
+        for element in (begin, instr, end):
+            run._r.append(element)
+
+ def _add_docx_watermark(self, section):
+ """Append a centered watermark paragraph to the header of ``section``.
+
+ Does nothing when the ``watermark_text`` parameter is empty.
+ """
+ if not self._param.watermark_text:
+ return
+
+ from docx.enum.text import WD_ALIGN_PARAGRAPH
+ from docx.oxml import parse_xml
+
+ header = section.header
+ paragraph = header.add_paragraph()
+ paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
+ run = paragraph.add_run()
+ # NOTE(review): the XML payload below appears empty in this view; confirm
+ # its content against the source file.
+ watermark_xml = parse_xml(
+ rf"""
+
+
+
+
+
+
+ """
+ )
+ run._r.append(watermark_xml)
+
+    def _normalize_docx_section_geometry(self, section, default_section):
+        """Fill a missing page width or left/right margin on ``section`` from ``default_section``."""
+        for name in ("page_width", "left_margin", "right_margin"):
+            if getattr(section, name) is not None:
+                continue
+            setattr(section, name, getattr(default_section, name))
+
+    def _get_docx_available_width(self, section):
+        """Return the page width of ``section`` minus its left and right margins.
+
+        Raises:
+            ValueError: If the page width or either margin is ``None``.
+        """
+        geometry = (section.page_width, section.left_margin, section.right_margin)
+        if any(measure is None for measure in geometry):
+            raise ValueError("DOCX section geometry is incomplete after normalization.")
+
+        total_width, left, right = geometry
+        return total_width - left - right
+
+ def _decorate_docx(self, file_path: str) -> tuple[str, bytes]:
+ """Apply font sizes, header, watermark and footer to the DOCX at ``file_path``.
+
+ The document is modified and saved back to the same path.
+
+ Returns:
+ tuple[str, bytes]: ``file_path`` and the bytes of the saved document.
+ """
+ from docx import Document
+ from docx.enum.text import WD_TAB_ALIGNMENT
+ from docx.shared import Pt
+
+ document = Document(file_path)
+ # A blank document supplies the fallback geometry used when a section
+ # has no page width or margins of its own.
+ default_section = Document().sections[0]
+ h1_size, h2_size, h3_size = self._get_heading_sizes()
+
+ style_map = {
+ "Normal": int(self._param.font_size),
+ "Heading 1": h1_size,
+ "Heading 2": h2_size,
+ "Heading 3": h3_size,
+ }
+ for style_name, size in style_map.items():
+ try:
+ document.styles[style_name].font.size = Pt(size)
+ except Exception:
+ # Best effort: a style that cannot be updated is skipped.
+ continue
+
+ for section in document.sections:
+ self._normalize_docx_section_geometry(section, default_section)
+ available_width = self._get_docx_available_width(section)
+
+ header = section.header
+ header.is_linked_to_previous = False
+ # Drop existing header content before adding the configured text.
+ self._clear_docx_container(header)
+ if self._param.header_text:
+ paragraph = header.add_paragraph()
+ paragraph.add_run(self._param.header_text)
+
+ self._add_docx_watermark(section)
+
+ footer = section.footer
+ footer.is_linked_to_previous = False
+ self._clear_docx_container(footer)
+ if any(
+ [
+ self._param.footer_text,
+ self._param.add_timestamp,
+ self._param.add_page_numbers,
+ ]
+ ):
+ paragraph = footer.add_paragraph()
+ # Tab stops at the middle and at the right edge of the text area.
+ paragraph.paragraph_format.tab_stops.add_tab_stop(
+ int(available_width // 2),
+ WD_TAB_ALIGNMENT.CENTER,
+ )
+ paragraph.paragraph_format.tab_stops.add_tab_stop(
+ int(available_width),
+ WD_TAB_ALIGNMENT.RIGHT,
+ )
+
+ if self._param.footer_text:
+ paragraph.add_run(self._param.footer_text)
+
+ # The first tab moves to the center stop, the second to the right stop.
+ if self._param.add_timestamp or self._param.add_page_numbers:
+ paragraph.add_run("\t")
+
+ if self._param.add_timestamp:
+ paragraph.add_run(self._get_timestamp_text())
+
+ if self._param.add_page_numbers:
+ paragraph.add_run("\t")
+ self._append_docx_field(paragraph.add_run(), " PAGE ")
+
+ document.save(file_path)
+ with open(file_path, "rb") as f:
+ file_bytes = f.read()
+ return file_path, file_bytes
def thoughts(self) -> str:
- return "Generating PDF document with formatted content..."
+ """Return a status message naming the configured output format in upper case."""
+ return f"Generating {self._param.output_format.upper()} document with markdown conversion..."
- def _generate_docx(self, content: str, title: str = "", subtitle: str = "") -> tuple[str, str]:
- """Generate DOCX from markdown-style content"""
- import uuid
- from docx import Document
- from docx.shared import Pt
- from docx.enum.text import WD_ALIGN_PARAGRAPH
-
- # Create output directory if it doesn't exist
- os.makedirs(self._param.output_directory, exist_ok=True)
-
+ def _generate_pdf(self, content: str) -> tuple[str, bytes]:
try:
- # Generate filename
- if self._param.filename:
- base_name = os.path.splitext(self._param.filename)[0]
- filename = f"{base_name}_{uuid.uuid4().hex[:8]}.docx"
- else:
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
- filename = f"document_{timestamp}_{uuid.uuid4().hex[:8]}.docx"
-
- file_path = os.path.join(self._param.output_directory, filename)
-
- # Create document
- doc = Document()
-
- # Add title
- if title:
- title_para = doc.add_heading(title, level=0)
- title_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
-
- # Add subtitle
- if subtitle:
- subtitle_para = doc.add_heading(subtitle, level=1)
- subtitle_para.alignment = WD_ALIGN_PARAGRAPH.CENTER
-
- # Add timestamp if enabled
- if self._param.add_timestamp:
- timestamp_text = f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
- ts_para = doc.add_paragraph(timestamp_text)
- ts_para.runs[0].italic = True
- ts_para.runs[0].font.size = Pt(9)
-
- # Parse and add content
- lines = content.split('\n')
- i = 0
- while i < len(lines):
- line = lines[i].strip()
-
- if not line:
- i += 1
- continue
-
- # Headings
- if line.startswith('# ') and not line.startswith('## '):
- doc.add_heading(line[2:].strip(), level=1)
- elif line.startswith('## ') and not line.startswith('### '):
- doc.add_heading(line[3:].strip(), level=2)
- elif line.startswith('### '):
- doc.add_heading(line[4:].strip(), level=3)
- # Bullet list
- elif line.startswith('- ') or line.startswith('* '):
- doc.add_paragraph(line[2:].strip(), style='List Bullet')
- # Numbered list
- elif re.match(r'^\d+\.\s', line):
- text = re.sub(r'^\d+\.\s', '', line)
- doc.add_paragraph(text, style='List Number')
- # Regular paragraph
- else:
- para = doc.add_paragraph(line)
- para.runs[0].font.size = Pt(self._param.font_size)
-
- i += 1
-
- # Save document
- doc.save(file_path)
-
- # Read and encode to base64
- with open(file_path, 'rb') as f:
- doc_bytes = f.read()
- doc_base64 = base64.b64encode(doc_bytes).decode('utf-8')
-
- return file_path, doc_base64
-
+ engine = self._select_pdf_engine()
+ header_path = self._write_temp_tex(self._build_pdf_heading_overrides())
+ try:
+ file_path, _ = self._generate_pandoc_binary_output(
+ content,
+ "pdf",
+ "pdf",
+ include_timestamp_in_body=False,
+ extra_args=[
+ "--standalone",
+ f"--pdf-engine={engine}",
+ f"--include-in-header={header_path}",
+ *self._get_pdf_font_args(),
+ ],
+ )
+ finally:
+ if os.path.exists(header_path):
+ os.remove(header_path)
+ return self._apply_pdf_overlay(file_path)
+ except Exception as e:
+ raise Exception(f"PDF generation failed: {str(e)}")
+
+    def _generate_docx(self, content: str) -> tuple[str, bytes]:
+        """Convert markdown ``content`` to DOCX with pandoc, then decorate the file.
+
+        Returns:
+            The path and bytes returned by ``_decorate_docx``.
+
+        Raises:
+            Exception: If conversion or decoration fails; the original error is
+                attached as ``__cause__``.
+        """
+        try:
+            file_path, _ = self._generate_pandoc_binary_output(
+                content,
+                "docx",
+                "docx",
+                include_timestamp_in_body=False,
+                extra_args=["--standalone"],
+            )
+            return self._decorate_docx(file_path)
         except Exception as e:
-            raise Exception(f"DOCX generation failed: {str(e)}")
+            # Mark the original error as the direct cause of the wrapper.
+            raise Exception(f"DOCX generation failed: {str(e)}") from e
- def _generate_txt(self, content: str, title: str = "", subtitle: str = "") -> tuple[str, str]:
- """Generate TXT from markdown-style content"""
- import uuid
-
- # Create output directory if it doesn't exist
- os.makedirs(self._param.output_directory, exist_ok=True)
-
+ def _generate_txt(self, content: str) -> tuple[str, bytes]:
try:
- # Generate filename
- if self._param.filename:
- base_name = os.path.splitext(self._param.filename)[0]
- filename = f"{base_name}_{uuid.uuid4().hex[:8]}.txt"
- else:
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
- filename = f"document_{timestamp}_{uuid.uuid4().hex[:8]}.txt"
-
- file_path = os.path.join(self._param.output_directory, filename)
-
- # Build text content
- text_content = []
-
- if title:
- text_content.append(title.upper())
- text_content.append("=" * len(title))
- text_content.append("")
-
- if subtitle:
- text_content.append(subtitle)
- text_content.append("-" * len(subtitle))
- text_content.append("")
-
- if self._param.add_timestamp:
- timestamp_text = f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"
- text_content.append(timestamp_text)
- text_content.append("")
-
- # Add content (keep markdown formatting for readability)
- text_content.append(content)
-
- # Join and save
- final_text = '\n'.join(text_content)
-
- with open(file_path, 'w', encoding='utf-8') as f:
- f.write(final_text)
-
- # Encode to base64
- txt_base64 = base64.b64encode(final_text.encode('utf-8')).decode('utf-8')
-
- return file_path, txt_base64
-
+ return self._generate_pandoc_text_output(content, "plain", "txt")
except Exception as e:
raise Exception(f"TXT generation failed: {str(e)}")
+
+    def _generate_markdown(self, content: str) -> tuple[str, bytes]:
+        """Render ``content`` as Markdown and write it to a ``.md`` file.
+
+        Returns:
+            The path and bytes returned by ``_generate_pandoc_text_output``.
+
+        Raises:
+            Exception: If conversion or writing fails; the original error is
+                attached as ``__cause__``.
+        """
+        try:
+            return self._generate_pandoc_text_output(content, "markdown", "md")
+        except Exception as e:
+            # Mark the original error as the direct cause of the wrapper.
+            raise Exception(f"Markdown generation failed: {str(e)}") from e
+
+    def _generate_html(self, content: str) -> tuple[str, bytes]:
+        """Render ``content`` as HTML and write it to a ``.html`` file.
+
+        Returns:
+            The path and bytes returned by ``_generate_pandoc_text_output``.
+
+        Raises:
+            Exception: If conversion or writing fails; the original error is
+                attached as ``__cause__``.
+        """
+        try:
+            return self._generate_pandoc_text_output(content, "html", "html")
+        except Exception as e:
+            # Mark the original error as the direct cause of the wrapper.
+            raise Exception(f"HTML generation failed: {str(e)}") from e
diff --git a/agent/component/message.py b/agent/component/message.py
index cc26ca52ba..8db4eedbd1 100644
--- a/agent/component/message.py
+++ b/agent/component/message.py
@@ -54,6 +54,9 @@ class MessageParam(ComponentParamBase):
self.outputs = {
"content": {
"type": "str"
+ },
+ "downloads": {
+ "type": "list"
}
}
@@ -66,10 +69,66 @@ class MessageParam(ComponentParamBase):
class Message(ComponentBase):
component_name = "Message"
+    @staticmethod
+    def _is_download_info(value: Any) -> bool:
+        """Tell whether ``value`` has the shape of a download descriptor.
+
+        A descriptor is a ``dict`` containing all of the keys ``doc_id``,
+        ``filename`` and ``mime_type``. The values stored under those keys
+        are not inspected.
+        """
+        if not isinstance(value, dict):
+            return False
+        for required_key in ("doc_id", "filename", "mime_type"):
+            if required_key not in value:
+                return False
+        return True
+
+    def _extract_downloads(self, value: Any) -> list[dict[str, Any]]:
+        """Collect the download descriptors carried by ``value``.
+
+        ``value`` may be a single descriptor, a list made up only of
+        descriptors, or a string holding the JSON encoding of either.
+        Anything else -- including a string that does not decode as JSON or a
+        list with at least one non-descriptor item -- gives an empty list.
+        """
+        candidate = value
+        if isinstance(candidate, str):
+            # A string is only of interest when it decodes as JSON.
+            try:
+                candidate = json.loads(candidate)
+            except Exception:
+                return []
+
+        if self._is_download_info(candidate):
+            return [candidate]
+
+        if not isinstance(candidate, list):
+            return []
+        for entry in candidate:
+            if not self._is_download_info(entry):
+                return []
+        # Every item is a descriptor (or the list is empty): hand it back as is.
+        return candidate
+
+    def _stringify_message_value(
+        self,
+        value: Any,
+        delimiter: str | None = None,
+        downloads: list[dict[str, Any]] | None = None,
+        fallback_to_str: bool = False,
+    ) -> str:
+        """Turn a resolved variable value into message text.
+
+        Download descriptors (as recognised by ``_extract_downloads``) are
+        never rendered as text: they are appended to ``downloads`` when a
+        list is supplied, and an empty string is returned either way.
+
+        Args:
+            value: The value to render.
+            delimiter: When truthy, a list ``value`` is rendered as its items
+                passed through ``str`` and joined with this string.
+            downloads: Optional list that receives the extracted descriptors.
+            fallback_to_str: Return ``str(value)`` instead of ``""`` when
+                ``value`` cannot be JSON-encoded.
+
+        Returns:
+            ``""`` for ``None`` and for download descriptors, the string
+            itself for ``str`` input, the joined items for a list with a
+            delimiter, otherwise the JSON encoding of ``value``.
+        """
+        extracted_downloads = self._extract_downloads(value)
+        if extracted_downloads:
+            if downloads is not None:
+                downloads.extend(extracted_downloads)
+            return ""
+
+        if value is None:
+            return ""
+
+        if isinstance(value, list) and delimiter:
+            return delimiter.join([str(vv) for vv in value])
+
+        if isinstance(value, str):
+            return value
+
+        try:
+            return json.dumps(value, ensure_ascii=False)
+        except Exception:
+            # Best effort: a value that cannot be encoded must not break the message.
+            if fallback_to_str:
+                return str(value)
+            return ""
+
def get_input_elements(self) -> dict[str, Any]:
return self.get_input_elements_from_text("".join(self._param.content))
- def get_kwargs(self, script:str, kwargs:dict = {}, delimiter:str=None) -> tuple[str, dict[str, str | list | Any]]:
+ def get_kwargs(
+ self,
+ script: str,
+ kwargs: dict = {},
+ delimiter: str = None,
+ downloads: list[dict[str, Any]] | None = None,
+ ) -> tuple[str, dict[str, str | list | Any]]:
for k,v in self.get_input_elements_from_text(script).items():
if k in kwargs:
continue
@@ -84,15 +143,8 @@ class Message(ComponentBase):
else:
for t in iter_obj:
ans += t
- elif isinstance(v, list) and delimiter:
- ans = delimiter.join([str(vv) for vv in v])
- elif not isinstance(v, str):
- try:
- ans = json.dumps(v, ensure_ascii=False)
- except Exception:
- pass
else:
- ans = v
+ ans = self._stringify_message_value(v, delimiter, downloads)
if not ans:
ans = ""
kwargs[k] = ans
@@ -115,6 +167,7 @@ class Message(ComponentBase):
s = 0
all_content = ""
cache = {}
+ downloads = []
for r in re.finditer(self.variable_ref_patt, rand_cnt, flags=re.DOTALL):
if self.check_if_canceled("Message streaming"):
return
@@ -154,11 +207,9 @@ class Message(ComponentBase):
continue
elif inspect.isawaitable(v):
v = await v
- elif not isinstance(v, str):
- try:
- v = json.dumps(v, ensure_ascii=False)
- except Exception:
- v = str(v)
+ v = self._stringify_message_value(
+ v, downloads=downloads, fallback_to_str=True
+ )
yield v
self.set_input_value(exp, v)
all_content += v
@@ -171,6 +222,7 @@ class Message(ComponentBase):
all_content += rand_cnt[s: ]
yield rand_cnt[s: ]
+ self.set_output("downloads", downloads)
self.set_output("content", all_content)
self._convert_content(all_content)
await self._save_to_memory(all_content)
@@ -191,12 +243,14 @@ class Message(ComponentBase):
self.set_output("content", partial(self._stream, rand_cnt))
return
- rand_cnt, kwargs = self.get_kwargs(rand_cnt, kwargs)
+ downloads = []
+ rand_cnt, kwargs = self.get_kwargs(rand_cnt, kwargs, downloads=downloads)
template = _jinja2_sandbox.from_string(rand_cnt)
try:
content = template.render(kwargs)
- except Exception:
- pass
+ except Exception as e:
+ logging.warning(f"Jinja2 template rendering failed: {e}")
+ content = rand_cnt # fallback to unrendered content
if self.check_if_canceled("Message processing"):
return
@@ -204,6 +258,7 @@ class Message(ComponentBase):
for n, v in kwargs.items():
content = re.sub(n, v, content)
+ self.set_output("downloads", downloads)
self.set_output("content", content)
self._convert_content(content)
self._save_to_memory(content)
diff --git a/agent/dsl_migration.py b/agent/dsl_migration.py
index 6fef629376..ca4ee894c3 100644
--- a/agent/dsl_migration.py
+++ b/agent/dsl_migration.py
@@ -22,6 +22,7 @@ import re
COMPONENT_RENAMES = {
"Splitter": "TokenChunker",
"HierarchicalMerger": "TitleChunker",
+ "PDFGenerator": "DocGenerator",
}
NODE_TYPE_RENAMES = {
diff --git a/api/apps/user_app.py b/api/apps/user_app.py
index 702e1bd855..7424899269 100644
--- a/api/apps/user_app.py
+++ b/api/apps/user_app.py
@@ -1029,7 +1029,6 @@ async def forget_reset_password():
new_pwd_string = base64.b64decode(new_pwd_base64).decode('utf-8')
new_pwd2_string = base64.b64decode(decrypt(new_pwd2)).decode('utf-8')
- REDIS_CONN.get(_verified_key(email))
if not REDIS_CONN.get(_verified_key(email)):
return get_json_result(data=False, code=RetCode.AUTHENTICATION_ERROR, message="email not verified")
diff --git a/docs/guides/agent/agent_component_reference/docs_generator.md b/docs/guides/agent/agent_component_reference/docs_generator.md
deleted file mode 100644
index 3ed8e342af..0000000000
--- a/docs/guides/agent/agent_component_reference/docs_generator.md
+++ /dev/null
@@ -1,241 +0,0 @@
----
-sidebar_position: 35
-slug: /docs_generator
----
-
-# Docs Generator component
-
-A component that generates downloadable PDF, DOCX, or TXT documents from markdown-style content with full Unicode support.
-
----
-
-The **Docs Generator** component enables you to create professional documents directly within your agent workflow. It accepts markdown-formatted text and converts it into downloadable files, making it ideal for generating reports, summaries, or any structured document output.
-
-## Key features
-
-- **Multiple output formats**: PDF, DOCX, and TXT
-- **Full Unicode support**: Automatic font switching for CJK (Chinese, Japanese, Korean), Arabic, Hebrew, and other non-Latin scripts
-- **Rich formatting**: Headers, lists, tables, code blocks, and more
-- **Customizable styling**: Fonts, margins, page size, and orientation
-- **Document extras**: Logo, watermark, page numbers, and timestamps
-- **Direct download**: Generates a download button for the chat interface
-
-## Prerequisites
-
-- Content to be converted into a document (typically from an **Agent** or other text-generating component).
-
-## Examples
-
-You can pair an **Agent** component with the **Docs Generator** to create dynamic documents based on user queries. The **Agent** generates the content, and the **Docs Generator** converts it into a downloadable file. Connect the output to a **Message** component to display the download button in the chat.
-
-A typical workflow looks like:
-
-```
-Begin → Agent → Docs Generator → Message
-```
-
-In the **Message** component, reference the `download` output variable from the **Docs Generator** to display a download button in the chat interface.
-
-## Configurations
-
-### Content
-
-The main text content to include in the document. Supports Markdown formatting:
-
-- **Bold**: `**text**` or `__text__`
-- **Italic**: `*text*` or `_text_`
-- **Inline code**: `` `code` ``
-- **Headings**: `# Heading 1`, `## Heading 2`, `### Heading 3`
-- **Bullet lists**: `- item` or `* item`
-- **Numbered lists**: `1. item`
-- **Tables**: `| Column 1 | Column 2 |`
-- **Horizontal lines**: `---`
-- **Code blocks**: ` ``` code ``` `
-
-:::tip NOTE
-Click **(x)** or type `/` to insert variables from upstream components.
-:::
-
-### Title
-
-Optional. The document title displayed at the top of the generated file.
-
-### Subtitle
-
-Optional. A subtitle displayed below the title.
-
-### Output format
-
-The file format for the generated document:
-
-- **PDF** (default): Portable Document Format with full styling support.
-- **DOCX**: Microsoft Word format.
-- **TXT**: Plain text format.
-
-### Logo image
-
-Optional. A logo image to display at the top of the document. You can either:
-
-- Upload an image file using the file picker
-- Paste an image path, URL, or base64-encoded data
-
-### Logo position
-
-The horizontal position of the logo:
-
-- **left** (default)
-- **center**
-- **right**
-
-### Logo dimensions
-
-- **Logo width**: Width in inches (default: `2.0`)
-- **Logo height**: Height in inches (default: `1.0`)
-
-### Font family
-
-The font used throughout the document:
-
-- **Helvetica** (default)
-- **Times-Roman**
-- **Courier**
-- **Helvetica-Bold**
-- **Times-Bold**
-
-### Font size
-
-The base font size in points. Defaults to `12`.
-
-### Title font size
-
-The font size for the document title. Defaults to `24`.
-
-### Page size
-
-The paper size for the document:
-
-- **A4** (default)
-- **Letter**
-
-### Orientation
-
-The page orientation:
-
-- **Portrait** (default)
-- **Landscape**
-
-### Margins
-
-Page margins in inches:
-
-- **Margin top**: Defaults to `1.0`
-- **Margin bottom**: Defaults to `1.0`
-- **Margin left**: Defaults to `1.0`
-- **Margin right**: Defaults to `1.0`
-
-### Filename
-
-Optional. Custom filename for the generated document. If left empty, a filename is auto-generated with a timestamp.
-
-### Output directory
-
-The server directory where generated documents are saved. Defaults to `/tmp/pdf_outputs`.
-
-### Add page numbers
-
-When enabled, page numbers are added to the footer of each page. Defaults to `true`.
-
-### Add timestamp
-
-When enabled, a generation timestamp is added to the document footer. Defaults to `true`.
-
-### Watermark text
-
-Optional. Text to display as a diagonal watermark across each page. Useful for marking documents as "Draft", "Confidential", etc.
-
-## Output
-
-The **Docs Generator** component provides the following output variables:
-
-| Variable name | Type | Description |
-|---------------|-----------|--------------------------------------------------------------|
-| `file_path` | `string` | The server path where the generated document is saved. |
-| `pdf_base64` | `string` | The document content encoded in base64 format. |
-| `download` | `string` | JSON containing download information for the chat interface. |
-| `success` | `boolean` | Indicates whether the document was generated successfully. |
-
-### Displaying the download button
-
-To display a download button in the chat, add a **Message** component after the **Docs Generator** and reference the `download` variable:
-
-1. Connect the **Docs Generator** output to a **Message** component.
-2. In the **Message** component's content field, type `/` and select `{Docs Generator_0@download}`.
-3. When the agent runs, a download button will appear in the chat, allowing users to download the generated document.
-
-The download button automatically handles:
-- File type detection (PDF, DOCX, TXT)
-- Proper MIME type for browser downloads
-- Base64 decoding for direct file delivery
-
-## Unicode and multi-language support
-
-The **Docs Generator** includes intelligent font handling for international content:
-
-### How it works
-
-1. **Content analysis**: The component scans the text for non-Latin characters.
-2. **Automatic font switching**: When CJK or other complex scripts are detected, the system automatically switches to a compatible CID font (STSong-Light for Chinese, HeiseiMin-W3 for Japanese, HYSMyeongJo-Medium for Korean).
-3. **Latin content**: For documents containing only Latin characters (including extended Latin, Cyrillic, and Greek), the user-selected font family is used.
-
-### Supported scripts
-
-| Script | Unicode Range | Font Used |
-|------------------------------|---------------|--------------------|
-| Chinese (CJK) | U+4E00–U+9FFF | STSong-Light |
-| Japanese (Hiragana/Katakana) | U+3040–U+30FF | HeiseiMin-W3 |
-| Korean (Hangul) | U+AC00–U+D7AF | HYSMyeongJo-Medium |
-| Arabic | U+0600–U+06FF | CID font fallback |
-| Hebrew | U+0590–U+05FF | CID font fallback |
-| Devanagari (Hindi) | U+0900–U+097F | CID font fallback |
-| Thai | U+0E00–U+0E7F | CID font fallback |
-
-### Font installation
-
-For full multi-language support in self-hosted deployments, ensure Unicode fonts are installed:
-
-**Linux (Debian/Ubuntu):**
-```bash
-apt-get install fonts-freefont-ttf fonts-noto-cjk
-```
-
-**Docker:** The official RAGFlow Docker image includes these fonts. For custom images, add the font packages to your Dockerfile:
-```dockerfile
-RUN apt-get update && apt-get install -y fonts-freefont-ttf fonts-noto-cjk
-```
-
-:::tip NOTE
-CID fonts (STSong-Light, HeiseiMin-W3, etc.) are built into ReportLab and do not require additional installation. They are used automatically when CJK content is detected.
-:::
-
-## Troubleshooting
-
-### Characters appear as boxes or question marks
-
-This indicates missing font support. Ensure:
-1. The content contains supported Unicode characters.
-2. For self-hosted deployments, Unicode fonts are installed on the server.
-3. The document is being viewed in a PDF reader that supports embedded fonts.
-
-### Download button not appearing
-
-Ensure:
-1. The **Message** component is connected after the **Docs Generator**.
-2. The `download` variable is correctly referenced using `/` (which appears as `{Docs Generator_0@download}` when copied).
-3. The document generation completed successfully (check `success` output).
-
-### Large tables not rendering correctly
-
-For tables with many columns or large cell content:
-- The component automatically converts wide tables to a definition list format for better readability.
-- Consider splitting large tables into multiple smaller tables.
-- Use landscape orientation for wide tables.
diff --git a/web/src/components/document-download-button/index.tsx b/web/src/components/document-download-button/index.tsx
new file mode 100644
index 0000000000..02eefdd461
--- /dev/null
+++ b/web/src/components/document-download-button/index.tsx
@@ -0,0 +1,88 @@
+import { Button } from '@/components/ui/button';
+import { IDocumentDownloadInfo } from '@/interfaces/database/chat';
+import { downloadFile } from '@/services/file-manager-service';
+import { downloadFileFromBlob } from '@/utils/file-util';
+import { Download, FileText } from 'lucide-react';
+import { useCallback } from 'react';
+
+export type DocumentDownloadInfo = IDocumentDownloadInfo;
+
+interface DocumentDownloadButtonProps {
+ downloadInfo: DocumentDownloadInfo;
+ className?: string;
+}
+
+export function DocumentDownloadButton({
+ downloadInfo,
+ className,
+}: DocumentDownloadButtonProps) {
+ const handleDownload = useCallback(async () => {
+ try {
+ const ext =
+ downloadInfo.filename.split('.').pop()?.toLowerCase() || 'bin';
+ const response = await downloadFile({
+ docId: downloadInfo.doc_id,
+ ext,
+ });
+ const blob = new Blob([response.data], {
+ type: downloadInfo.mime_type || response.data.type,
+ });
+ downloadFileFromBlob(blob, downloadInfo.filename);
+ } catch (error) {
+ console.error('Error downloading document:', error);
+ }
+ }, [downloadInfo]);
+
+ const getDocumentType = () => {
+ if (downloadInfo.mime_type === 'application/pdf') return 'PDF Document';
+ if (
+ downloadInfo.mime_type ===
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+ )
+ return 'Word Document';
+ if (
+ downloadInfo.mime_type ===
+ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
+ )
+ return 'Excel Document';
+ if (downloadInfo.mime_type === 'text/plain') return 'Text Document';
+ if (downloadInfo.mime_type === 'text/markdown') return 'Markdown Document';
+ if (downloadInfo.mime_type === 'text/html') return 'HTML Document';
+
+ const ext = downloadInfo.filename.split('.').pop()?.toUpperCase();
+ if (ext === 'PDF') return 'PDF Document';
+ if (ext === 'DOCX') return 'Word Document';
+ if (ext === 'XLSX') return 'Excel Document';
+ if (ext === 'TXT') return 'Text Document';
+ if (ext === 'MD') return 'Markdown Document';
+ if (ext === 'HTML' || ext === 'HTM') return 'HTML Document';
+
+ return 'Document';
+ };
+
+ return (
+
+
+
+
+ {downloadInfo.filename}
+
+
{getDocumentType()}
+
+
+
+ );
+}
diff --git a/web/src/components/message-item/index.tsx b/web/src/components/message-item/index.tsx
index 4f8814a2d1..25a68e22fe 100644
--- a/web/src/components/message-item/index.tsx
+++ b/web/src/components/message-item/index.tsx
@@ -10,15 +10,11 @@ import { memo, useCallback, useMemo } from 'react';
import { IRegenerateMessage, IRemoveMessageById } from '@/hooks/logic-hooks';
import { cn } from '@/lib/utils';
+import { DocumentDownloadButton } from '../document-download-button';
import MarkdownContent from '../markdown-content';
import { ReferenceDocumentList } from '../next-message-item/reference-document-list';
import { ReferenceImageList } from '../next-message-item/reference-image-list';
import { UploadedMessageFiles } from '../next-message-item/uploaded-message-files';
-import {
- PDFDownloadButton,
- extractPDFDownloadInfo,
- removePDFDownloadInfo,
-} from '../pdf-download-button';
import { RAGFlowAvatar } from '../ragflow-avatar';
import SvgIcon from '../svg-icon';
import { useTheme } from '../theme-provider';
@@ -67,19 +63,11 @@ const MessageItem = ({
return reference?.doc_aggs ?? [];
}, [reference?.doc_aggs]);
- // Extract PDF download info from message content
- const pdfDownloadInfo = useMemo(
- () => extractPDFDownloadInfo(item.content),
- [item.content],
+ const documentDownloadInfos = useMemo(
+ () => item.downloads ?? [],
+ [item.downloads],
);
-
- // If we have PDF download info, extract the remaining text
- const messageContent = useMemo(() => {
- if (!pdfDownloadInfo) return item.content;
-
- // Remove the JSON part from the content to avoid showing it
- return removePDFDownloadInfo(item.content, pdfDownloadInfo);
- }, [item.content, pdfDownloadInfo]);
+ const messageContent = item.content;
const handleRegenerateMessage = useCallback(() => {
regenerateMessage?.(item);
@@ -129,7 +117,7 @@ const MessageItem = ({
index !== 0 && (
)}
- {/* Show PDF download button if download info is present */}
- {pdfDownloadInfo && (
-
- )}
{/* Show message content if there's any text besides the download */}
{messageContent && (
)}
+ {documentDownloadInfos.length > 0 && (
+
+ {documentDownloadInfos.map((downloadInfo, index) => (
+
+ ))}
+
+ )}
diff --git a/web/src/components/next-message-item/index.tsx b/web/src/components/next-message-item/index.tsx
index b9e1b32324..ac63a6661a 100644
--- a/web/src/components/next-message-item/index.tsx
+++ b/web/src/components/next-message-item/index.tsx
@@ -25,12 +25,8 @@ import { citationMarkerReg } from '@/utils/citation-utils';
import { getDirAttribute } from '@/utils/text-direction';
import { isEmpty } from 'lodash';
import { Atom, ChevronDown, ChevronUp } from 'lucide-react';
+import { DocumentDownloadButton } from '../document-download-button';
import MarkdownContent from '../next-markdown-content';
-import {
- PDFDownloadButton,
- extractPDFDownloadInfo,
- removePDFDownloadInfo,
-} from '../pdf-download-button';
import { RAGFlowAvatar } from '../ragflow-avatar';
import SvgIcon from '../svg-icon';
import { useTheme } from '../theme-provider';
@@ -102,19 +98,11 @@ function MessageItem({
return Object.values(docs);
}, [reference?.doc_aggs]);
- // Extract PDF download info from message content
- const pdfDownloadInfo = useMemo(
- () => extractPDFDownloadInfo(item.content),
- [item.content],
+ const documentDownloadInfos = useMemo(
+ () => item.downloads ?? [],
+ [item.downloads],
);
-
- // If we have PDF download info, extract the remaining text
- const messageContent = useMemo(() => {
- if (!pdfDownloadInfo) return item.content;
-
- // Remove the JSON part from the content to avoid showing it
- return removePDFDownloadInfo(item.content, pdfDownloadInfo);
- }, [item.content, pdfDownloadInfo]);
+ const messageContent = item.content;
const handleRegenerateMessage = useCallback(() => {
regenerateMessage?.(item);
@@ -137,9 +125,7 @@ function MessageItem({
);
const renderContent = useCallback(() => {
- /* Show message content if there's any text besides the download */
-
- if (pdfDownloadInfo) {
+ if (!messageContent && !(item.data || (sendLoading && !isShare))) {
return null;
}
@@ -175,7 +161,6 @@ function MessageItem({
item.data,
loading,
messageContent,
- pdfDownloadInfo,
reference,
sendLoading,
theme,
@@ -239,7 +224,7 @@ function MessageItem({
{isShare && !sendLoading && !isEmpty(item.content) && (
) : (
)}
- {/* Show PDF download button if download info is present */}
- {pdfDownloadInfo && (
-
- )}
-
{renderContent()}
{isAssistant && (
@@ -320,6 +297,16 @@ function MessageItem({
files={item.files as File[] | UploadResponseDataType[]}
>
)}
+ {documentDownloadInfos.length > 0 && (
+
+ {documentDownloadInfos.map((downloadInfo, index) => (
+
+ ))}
+
+ )}
{/* {isAssistant && item.attachment && item.attachment.doc_id && (