mirror of
https://github.com/zebrajr/localGPT.git
synced 2025-12-06 00:20:19 +01:00
feat: Add TXT and MD file format support to DocumentConverter
- Add .txt and .md extensions to SUPPORTED_FORMATS mapping
- Add _convert_txt_to_markdown method for plain text files
- Support docling's native MD InputFormat for markdown files
- Add proper format detection and routing logic
- Preserve existing PDF OCR detection and multi-format support

Co-Authored-By: PromptEngineer <jnfarooq@outlook.com>
This commit is contained in:
parent
d5929ce29b
commit
583c72e340
|
|
@ -17,6 +17,8 @@ class DocumentConverter:
|
|||
'.docx': InputFormat.DOCX,
|
||||
'.html': InputFormat.HTML,
|
||||
'.htm': InputFormat.HTML,
|
||||
'.md': InputFormat.MD,
|
||||
'.txt': 'TXT', # Special handling for plain text files
|
||||
}
|
||||
|
||||
def __init__(self):
|
||||
|
|
@ -67,6 +69,8 @@ class DocumentConverter:
|
|||
|
||||
if input_format == InputFormat.PDF:
|
||||
return self._convert_pdf_to_markdown(file_path)
|
||||
elif input_format == 'TXT':
|
||||
return self._convert_txt_to_markdown(file_path)
|
||||
else:
|
||||
return self._convert_general_to_markdown(file_path, input_format)
|
||||
|
||||
|
|
@ -90,6 +94,22 @@ class DocumentConverter:
|
|||
print(f"Converting {pdf_path} to Markdown using docling {ocr_msg}...")
|
||||
return self._perform_conversion(pdf_path, converter, ocr_msg)
|
||||
|
||||
def _convert_txt_to_markdown(self, file_path: str) -> List[Tuple[str, Dict[str, Any]]]:
    """Convert a plain text file to Markdown by reading its content directly.

    The text is wrapped verbatim in a fenced code block. The fence starts
    at three backticks and is lengthened until it no longer occurs anywhere
    in the text, so a file that itself contains a backtick fence cannot
    terminate the wrapper early.

    The file is decoded as UTF-8. If it is not valid UTF-8, it is re-read
    with undecodable bytes replaced by U+FFFD and a warning is printed,
    instead of discarding the whole document.

    Args:
        file_path: Path of the text file to convert.

    Returns:
        A single-element list ``[(markdown, {"source": file_path})]`` on
        success, or an empty list if the file could not be processed (the
        error is printed, not raised).
    """
    print(f"Converting {file_path} (TXT) to Markdown...")
    try:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
        except UnicodeDecodeError:
            # Legacy-encoded text files are common; keep what can be decoded
            # rather than dropping the document entirely.
            print(f"Warning: {file_path} is not valid UTF-8; replacing undecodable bytes.")
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                content = f.read()

        # A closing fence must be at least as long as the opening one, so a
        # fence that never appears in the content cannot be closed by it.
        fence = "```"
        while fence in content:
            fence += "`"

        markdown_content = f"{fence}\n{content}\n{fence}"
        metadata = {"source": file_path}

        print(f"Successfully converted {file_path} (TXT) to Markdown.")
        return [(markdown_content, metadata)]
    except Exception as e:
        # Best-effort boundary: report the failure and return no documents,
        # leaving the decision about what to do next to the caller.
        print(f"Error processing TXT file {file_path}: {e}")
        return []
|
||||
|
||||
def _convert_general_to_markdown(self, file_path: str, input_format: InputFormat) -> List[Tuple[str, Dict[str, Any]]]:
|
||||
"""Convert non-PDF formats using general converter."""
|
||||
print(f"Converting {file_path} ({input_format.name}) to Markdown using docling...")
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user