fix: normalize excessive whitespace in streaming markdown responses

- Create a comprehensive text normalization utility to clean up excessive newlines
- Apply normalization to streaming tokens in session-chat.tsx
- Apply normalization to rendered text in conversation-page.tsx
- Add a test case demonstrating the fix for excessive empty lines
- Preserve proper markdown formatting while removing visual gaps

Co-Authored-By: PromptEngineer <jnfarooq@outlook.com>
2025-12-06 12:20:53 +01:00 · 2025-07-15 07:30:00 +00:00 · 2025-07-15 07:30:00 +00:00 · dc9722de28
commit dc9722de28
parent 6af1165894
4 changed files with 155 additions and 12 deletions
--- a/src/components/ui/conversation-page.tsx
+++ b/src/components/ui/conversation-page.tsx
@ -10,6 +10,7 @@ import { ScrollArea } from "@/components/ui/scroll-area"
 import { ChatMessage } from "@/lib/api"
 import { cn } from "@/lib/utils"
 import Markdown from "@/components/Markdown"
+import { normalizeWhitespace } from "@/utils/textNormalization"

 interface ConversationPageProps {
  messages: ChatMessage[]
@ -110,7 +111,7 @@ function ThinkingText({ text }: { text: string }) {
        </details>
      )}
      {visibleText.trim() && (
-        <Markdown text={visibleText} className="whitespace-pre-wrap" />
+        <Markdown text={normalizeWhitespace(visibleText)} className="whitespace-pre-wrap" />
      )}
    </>
  );
@ -151,7 +152,7 @@ function StructuredMessageBlock({ content }: { content: Array<Record<string, any
              {step.key === 'final' && step.details && typeof step.details === 'object' && !Array.isArray(step.details) ? (
                <div className="space-y-3">
                  <div className="whitespace-pre-wrap text-gray-100">
-                    <ThinkingText text={step.details.answer} />
+                    <ThinkingText text={normalizeWhitespace(step.details.answer)} />
                  </div>
                  {!hasSubAnswers && step.details.source_documents && step.details.source_documents.length > 0 && (
                    <CitationsBlock docs={step.details.source_documents} />
@ -159,7 +160,7 @@ function StructuredMessageBlock({ content }: { content: Array<Record<string, any
                </div>
              ) : step.key === 'final' && step.details && typeof step.details === 'string' ? (
                <div className="whitespace-pre-wrap text-gray-100">
-                  <ThinkingText text={step.details} />
+                  <ThinkingText text={normalizeWhitespace(step.details)} />
                </div>
              ) : Array.isArray(step.details) ? (
                step.key === 'decompose' && step.details.every((d: any)=> typeof d === 'string') ? (
@ -175,7 +176,7 @@ function StructuredMessageBlock({ content }: { content: Array<Record<string, any
                    {step.details.map((detail: any, idx: number) => (
                      <div key={idx} className="border-l-2 border-blue-400 pl-2">
                        <div className="font-semibold">{detail.question}</div>
-                        <div><ThinkingText text={detail.answer} /></div>
+                        <div><ThinkingText text={normalizeWhitespace(detail.answer)} /></div>
                        {detail.source_documents && detail.source_documents.length > 0 && (
                          <CitationsBlock docs={detail.source_documents} />
                        )}
@ -185,7 +186,7 @@ function StructuredMessageBlock({ content }: { content: Array<Record<string, any
                )
              ) : (
                // Handle string details
-                <ThinkingText text={step.details as string} />
+                <ThinkingText text={normalizeWhitespace(step.details as string)} />
              )}
            </div>
          );
@ -327,7 +328,7 @@ export function ConversationPage({
                      ) : (
                        <div className="whitespace-pre-wrap text-base leading-relaxed">
                          {typeof message.content === 'string' 
-                              ? <ThinkingText text={message.content} />
+                              ? <ThinkingText text={normalizeWhitespace(message.content)} />
                              : <StructuredMessageBlock content={message.content} />
                          }
                        </div>
--- a/src/components/ui/session-chat.tsx
+++ b/src/components/ui/session-chat.tsx
@ -7,6 +7,7 @@ import { EmptyChatState } from "./empty-chat-state"
 import { ChatMessage, ChatSession, chatAPI, generateUUID } from "@/lib/api"
 import { AttachedFile } from "@/lib/types"
 import { useEffect, useState, forwardRef, useImperativeHandle, useCallback } from "react"
+import { normalizeStreamingToken } from "@/utils/textNormalization"
 import { Button } from "./button"
 import type { Step } from '@/lib/api'
 import { ChatSettingsModal } from '@/components/ui/chat-settings-modal'
@ -368,8 +369,7 @@ export const SessionChat = forwardRef<SessionChatRef, SessionChatProps>(({
                  return m; // skip empty/whitespace-only chunks
                }
                let updated = current.endsWith(tok) ? current : current + tok;
-                // Clean up excessive newlines
-                updated = updated.replace(/\n{3,}/g, '\n\n');
+                updated = normalizeStreamingToken('', updated);
                if (steps[finalIdx].key === 'direct') {
                  steps[0].details = updated;
                } else {
@ -396,8 +396,7 @@ export const SessionChat = forwardRef<SessionChatRef, SessionChatProps>(({
                const curAns: string = detailsArr[idx].answer || '';
                if (!curAns.endsWith(tok)) {
                  let updatedAnswer = curAns + tok;
-                  // Clean up excessive newlines
-                  updatedAnswer = updatedAnswer.replace(/\n{3,}/g, '\n\n');
+                  updatedAnswer = normalizeStreamingToken('', updatedAnswer);
                  detailsArr[idx].answer = updatedAnswer;
                }
                steps[5].details = detailsArr;
--- a/src/utils/textNormalization.ts
+++ b/src/utils/textNormalization.ts
@ -0,0 +1,63 @@
+/**
+ * Comprehensive text normalization utility for cleaning up excessive whitespace
+ * in streaming markdown responses to prevent large visual gaps in the UI.
+ */
+
+/**
+ * Normalizes whitespace in markdown text so the rendered output has no large
+ * vertical or horizontal gaps.
+ *
+ * Steps, in order:
+ *  1. Convert CRLF / lone CR line endings to LF so the newline rules apply.
+ *  2. Strip trailing spaces and tabs from every line (whitespace-only lines
+ *     become empty lines).
+ *  3. Collapse runs of three or more newlines into a single blank line.
+ *  4. Collapse runs of three or more spaces/tabs that follow visible text
+ *     into one space. Leading indentation is left intact so indented code
+ *     blocks and nested list items keep their structure.
+ *  5. Trim leading and trailing whitespace of the whole text.
+ *
+ * @param text - Raw text to clean up.
+ * @returns The normalized text, or `''` when `text` is empty or not a string.
+ */
+export function normalizeWhitespace(text: string): string {
+  if (!text || typeof text !== 'string') {
+    return '';
+  }
+
+  return text
+    .replace(/\r\n?/g, '\n')
+    // Strip line-end whitespace BEFORE collapsing newlines: lines that hold
+    // only spaces/tabs otherwise hide a run of blank lines from the collapse.
+    .replace(/[ \t]+$/gm, '')
+    .replace(/\n{3,}/g, '\n\n')
+    // Only runs preceded by a visible character are collapsed, so indentation
+    // at the start of a line survives.
+    .replace(/(\S)[ \t]{3,}/g, '$1 ')
+    .trim();
+}
+
+/**
+ * Appends a streamed token to the text accumulated so far and normalizes the
+ * result with `normalizeWhitespace`.
+ *
+ * Unlike `normalizeWhitespace`, the trailing separator of the stream is kept
+ * (reduced to a single space, or to at most two newlines). The result is
+ * stored back as the accumulated text by the streaming callers, so dropping
+ * the trailing whitespace would glue the next token onto the previous word or
+ * paragraph.
+ *
+ * @param currentText - Text accumulated so far.
+ * @param newToken - Newly received token to append.
+ * @returns The normalized accumulated text; `currentText` unchanged when
+ *   `newToken` is empty or not a string.
+ */
+export function normalizeStreamingToken(currentText: string, newToken: string): string {
+  if (!newToken || typeof newToken !== 'string') {
+    return currentText;
+  }
+
+  const combined = currentText + newToken;
+  const normalized = normalizeWhitespace(combined);
+  if (!normalized) {
+    // Whitespace-only so far: nothing to separate yet.
+    return normalized;
+  }
+
+  // normalizeWhitespace() trims the end; recover what the stream ended with.
+  const tail = combined.slice(combined.trimEnd().length);
+  if (!tail) {
+    return normalized;
+  }
+
+  const newlineCount = tail.split('\n').length - 1;
+  const separator = newlineCount > 0 ? '\n'.repeat(Math.min(newlineCount, 2)) : ' ';
+  return normalized + separator;
+}
+
+/**
+ * Reports whether `text` contains at least one whitespace run that counts as
+ * excessive: three or more consecutive newlines, three or more consecutive
+ * spaces/tabs, or three newlines separated only by spaces/tabs.
+ *
+ * @param text - Text to inspect.
+ * @returns `true` when any such run is present; `false` otherwise, including
+ *   for empty or non-string input.
+ */
+export function hasExcessiveWhitespace(text: string): boolean {
+  if (typeof text !== 'string' || text.length === 0) {
+    return false;
+  }
+
+  const excessivePatterns = [
+    /\n{3,}/,
+    /[ \t]{3,}/,
+    /[ \t]*\n[ \t]*\n[ \t]*\n/,
+  ];
+
+  return excessivePatterns.some((pattern) => pattern.test(text));
+}
--- a/test_markdown_streaming.js
+++ b/test_markdown_streaming.js
@ -0,0 +1,80 @@
+
+// Markdown fixture: headings and paragraphs separated by several runs of
+// consecutive blank lines, used as input for the cleanup functions below.
+const testMarkdownWithExcessiveNewlines = `# Test Response
+
+This is a test response with excessive newlines.
+
+
+
+Here's some content after multiple empty lines.
+
+
+
+
+## Section Header
+
+More content here.
+
+
+
+
+
+
+### Subsection
+
+Final content with lots of spacing.
+
+
+
+
+The end.`;
+
+// Stream-like chunks of a markdown document; the runs of extra newlines sit
+// at the end of individual chunks, and some sentences are split across chunks.
+const testStreamingTokens = [
+  "# Test Response\n\n",
+  "This is a test response",
+  " with excessive newlines.\n\n\n\n",
+  "Here's some content after",
+  " multiple empty lines.\n\n\n\n\n",
+  "## Section Header\n\n",
+  "More content here.\n\n\n\n\n\n\n",
+  "### Subsection\n\n",
+  "Final content with lots",
+  " of spacing.\n\n\n\n\n",
+  "The end."
+];
+
+// Baseline cleanup: squeezes every run of three or more newline characters
+// down to exactly two. Spaces and tabs are not touched.
+function currentCleanup(text) {
+  const tripleOrMoreNewlines = /\n{3,}/g;
+  const singleBlankLine = '\n\n';
+  return text.replace(tripleOrMoreNewlines, singleBlankLine);
+}
+
+// Improved cleanup for the demo: normalizes line endings, strips line-end
+// whitespace, collapses blank-line runs, collapses interior space/tab runs,
+// and trims the whole text.
+function improvedCleanup(text) {
+  return text
+    .replace(/\r\n?/g, '\n')
+    // Strip line-end whitespace BEFORE collapsing newlines: lines that hold
+    // only spaces/tabs otherwise hide a run of blank lines from the collapse.
+    .replace(/[ \t]+$/gm, '')
+    .replace(/\n{3,}/g, '\n\n')
+    // Only runs preceded by a visible character are collapsed, so leading
+    // indentation (nested lists, indented code) survives.
+    .replace(/(\S)[ \t]{3,}/g, '$1 ')
+    .trim();
+}
+
+// Print the fixture and both cleanup results as JSON strings so that every
+// newline shows up as a visible "\n" escape.
+const fullTextReports = [
+  ["=== ORIGINAL TEXT ===", testMarkdownWithExcessiveNewlines],
+  ["\n=== CURRENT CLEANUP ===", currentCleanup(testMarkdownWithExcessiveNewlines)],
+  ["\n=== IMPROVED CLEANUP ===", improvedCleanup(testMarkdownWithExcessiveNewlines)],
+];
+for (const [heading, value] of fullTextReports) {
+  console.log(heading);
+  console.log(JSON.stringify(value));
+}
+
+// Replay the chunks one by one and show both cleanups of the text so far.
+// NOTE(review): the cleaned text is only logged; the accumulator keeps the
+// raw chunks, so this does not exercise feeding cleaned text back into the
+// next step.
+console.log("\n=== STREAMING SIMULATION ===");
+let streamedText = "";
+for (const [i, token] of testStreamingTokens.entries()) {
+  streamedText += token;
+  console.log(`Token ${i + 1}: "${token}"`);
+  console.log(`Accumulated (current): "${currentCleanup(streamedText)}"`);
+  console.log(`Accumulated (improved): "${improvedCleanup(streamedText)}"`);
+  console.log("---");
+}