Browse Source

Created a PDF Reader and bare minimum Prompt builder.
Restructured main chatbot file

eliasCybob 5 months ago
parent
commit
3d214edfe4
4 changed files with 270 additions and 67 deletions
  1. 17 3
      api/CSVReader.php
  2. 68 0
      api/PDFReader.php
  3. 140 0
      api/PromptBuilder.php
  4. 45 64
      api/chatbot.php

+ 17 - 3
api/CsvReader.php → api/CSVReader.php

@@ -2,10 +2,10 @@
 
 
 
 
 /**
 /**
- * CsvReader class to read CSV files from a specified directory.
+ * CSVReader class to read CSV files from a specified directory.
  * It reads all CSV files, combines their headers, and returns an array of rows.
  * It reads all CSV files, combines their headers, and returns an array of rows.
  */
  */
-class CsvReader {
+class CSVReader {
 
 
     /**
     /**
      * Directory where CSV files are stored.
      * Directory where CSV files are stored.
@@ -14,7 +14,7 @@ class CsvReader {
     private $dir;
     private $dir;
 
 
     /**
     /**
-     * Constructor to initialize the CsvReader with a directory.
+     * Constructor to initialize the CSVReader with a directory.
      *
      *
      * @param string $directory The directory containing CSV files.
      * @param string $directory The directory containing CSV files.
      */
      */
@@ -42,4 +42,18 @@ class CsvReader {
         }
         }
         return $rows;
         return $rows;
     }
     }
+
+    /**
+     * Renders CSV rows as a bullet-style text block.
+     *
+     * Every row becomes one line of the form "- cell | cell | ...", terminated
+     * by a newline. An empty row list yields an empty string.
+     *
+     * @param array $csvRows An array of arrays, each holding the cell values of one row.
+     * @return string The rendered lines, concatenated.
+     */
+    public function toString($csvRows) {
+        $lines = [];
+        foreach ($csvRows as $cells) {
+            $lines[] = "- " . implode(" | ", $cells) . "\n";
+        }
+        return implode('', $lines);
+    }
 }
 }

+ 68 - 0
api/PDFReader.php

@@ -0,0 +1,68 @@
+<?php
+
+/**
+ * Reads PDF files from a directory and extracts their text.
+ *
+ * Text extraction is delegated to the external `pdftotext` command line tool;
+ * each PDF becomes one row holding its file name and the extracted text.
+ */
+class PDFReader {
+    /**
+     * Directory that holds the PDF files, always stored with a trailing slash.
+     * @var string
+     */
+    private $dir;
+
+    /**
+     * Creates a reader for the given directory.
+     *
+     * @param string $directory The directory containing PDF files (trailing slash optional).
+     */
+    public function __construct($directory) {
+        $this->dir = rtrim($directory, '/') . '/';
+    }
+
+    /**
+     * Extracts the text of every "*.pdf" file in the directory.
+     *
+     * @return array A list of rows, each with the keys 'filename' and 'text'.
+     *               Empty when the directory holds no PDFs or cannot be read.
+     */
+    public function readAll() {
+        $files = glob($this->dir . '*.pdf');
+        if ($files === false) {
+            // glob() reports errors as false, which must not reach foreach
+            $files = [];
+        }
+
+        $rows = [];
+        foreach ($files as $file) {
+            $rows[] = [
+                'filename' => basename($file),
+                'text' => $this->readFile($file)
+            ];
+        }
+        return $rows;
+    }
+
+    /**
+     * Extracts the text of a single PDF with the pdftotext command line tool.
+     *
+     * @param string $file Path to the PDF file; paths that do not already start with
+     *                     the reader's directory are resolved relative to it.
+     * @return string Extracted text, or an empty string when the file is missing
+     *                or nothing could be extracted.
+     */
+    public function readFile($file) {
+        $filePath = strpos($file, $this->dir) === 0 ? $file : $this->dir . ltrim($file, '/');
+        if (!is_file($filePath)) {
+            // No point in shelling out for a file that is not there
+            return '';
+        }
+
+        $tmp = tempnam(sys_get_temp_dir(), 'pdftext');
+        if ($tmp === false) {
+            error_log("PDFReader: could not create a temporary file for " . $filePath);
+            return '';
+        }
+
+        $output = [];
+        $exitCode = 0;
+        exec("pdftotext " . escapeshellarg($filePath) . " " . escapeshellarg($tmp), $output, $exitCode);
+        if ($exitCode !== 0) {
+            // Best effort: still return whatever was written, but leave a trace in the log
+            error_log("PDFReader: pdftotext exited with code " . $exitCode . " for " . $filePath);
+        }
+
+        $text = file_get_contents($tmp);
+        unlink($tmp);
+
+        // Compare against false explicitly so that a text of "0" is not turned into ''
+        return $text === false ? '' : $text;
+    }
+
+    /**
+     * Renders PDF rows as one text block.
+     *
+     * Each row is written as "- <filename>:", followed by the trimmed text on the
+     * next line and a blank line.
+     *
+     * @param array $pdfRows Rows as returned by readAll() (keys 'filename' and 'text').
+     * @return string The rendered rows, concatenated.
+     */
+    public function toString($pdfRows) {
+        $pdfString = '';
+        foreach ($pdfRows as $pdfRow) {
+            $pdfString .= "- " . $pdfRow['filename'] . ":\n" . trim($pdfRow['text']) . "\n\n";
+        }
+        return $pdfString;
+    }
+}

+ 140 - 0
api/PromptBuilder.php

@@ -0,0 +1,140 @@
+<?php
+
+/**
+ * Assembles the pieces of a chatbot prompt: the assistant's role, the user's
+ * question and optional context data.
+ *
+ * The prompt can be rendered as one plain-text string (buildPromptString) or as a
+ * JSON request body for a chat-completions endpoint (buildPromptData).
+ */
+class PromptBuilder
+{
+    /**
+     * Text that introduces the context inside a prompt.
+     */
+    const CONTEXT_INTRO = "\nDu hast folgenden Kontext, auf den du dich NUR fokussierst: ";
+
+    /**
+     * Role description for the assistant, e.g., "You are a helpful assistant."
+     * @var string
+     */
+    private $role = "";
+
+    /**
+     * Question asked by the user, e.g., "What is the weather today?"
+     * @var string
+     */
+    private $question = "";
+
+    /**
+     * Context for the chatbot, e.g., additional information or data to consider.
+     * @var string
+     */
+    private $context = "";
+
+    /**
+     * Whether the context may be used in the prompt.
+     * If false, the context is left out of the prompt.
+     * @var bool
+     */
+    private $allowContext = true;
+
+    /**
+     * Returns the role of the chatbot.
+     * @return string The role of the chatbot.
+     */
+    public function getRole(){
+        return $this->role;
+    }
+
+    /**
+     * Returns the user's question.
+     * @return string The question asked by the user.
+     */
+    public function getQuestion(){
+        return $this->question;
+    }
+
+    /**
+     * Returns the context given to the chatbot.
+     * @return string The context text.
+     */
+    public function getContext(){
+        return $this->context;
+    }
+
+    /**
+     * Returns whether the context is allowed in the prompt.
+     * @return bool True if context is allowed, false otherwise.
+     */
+    public function getContextAllowness(){
+        return $this->allowContext;
+    }
+
+    /**
+     * Sets the role of the chatbot.
+     * @param string $role The role to set for the chatbot.
+     */
+    public function setRole($role){
+        $this->role = $role;
+    }
+
+    /**
+     * Sets the user's question.
+     * @param string $question The question asked by the user.
+     */
+    public function setQuestion($question){
+        $this->question = $question;
+    }
+
+    /**
+     * Sets the context for the chatbot.
+     * @param string $context The context text to include in the prompt.
+     */
+    public function setContext($context){
+        //TODO: Better context handling in order to bypass token limits
+        // Maybe require an Array of context items? (MCP Like?)
+        $this->context = $context;
+    }
+
+    /**
+     * Sets whether the context is allowed in the prompt.
+     * @param bool $allowContext True to allow context, false to disallow.
+     */
+    public function setContextAllowness($allowContext){
+        $this->allowContext = $allowContext;
+    }
+
+    /**
+     * Builds the prompt as one plain-text string.
+     * It consists of the role, the context (if allowed) and the question,
+     * followed by a closing instruction to answer the question.
+     * @return string The complete prompt string.
+     */
+    public function buildPromptString(){
+        $prompt = "Deine Rolle ist: " . $this->role;
+
+        if ($this->allowContext) {
+            $prompt .= self::CONTEXT_INTRO . $this->context;
+        }
+
+        $prompt .= "\nDies ist die Frage des Nutzers: " . $this->question;
+        $prompt .= "\nBeantworte die Frage so gut es geht\n";
+        return $prompt;
+    }
+
+    /**
+     * Builds the JSON request body for a chat-completions API call.
+     *
+     * The body holds the model name, a system message (role plus context) and
+     * a user message (question), together with the sampling parameters.
+     * @return string JSON encoded prompt data.
+     */
+    public function buildPromptData(){
+        $systemContent = $this->getRole();
+
+        // Chat-completions messages have no "context" role, so the context is
+        // carried inside the system message. An empty context is skipped so the
+        // model is not told to focus only on nothing.
+        if ($this->allowContext && (string) $this->context !== "") {
+            $systemContent .= self::CONTEXT_INTRO . $this->getContext();
+        }
+
+        $messages = [
+            ["role" => "system", "content" => $systemContent],
+            ["role" => "user", "content" => $this->getQuestion()],
+        ];
+
+        return json_encode([
+            "model" => "deepseek-llm-7b-chat",
+            "messages" => $messages,
+            "temperature" => 0.5,
+            "max_tokens" => -1,
+            "stream" => false
+        ]);
+    }
+}

+ 45 - 64
api/chatbot.php

@@ -1,9 +1,17 @@
 <?php
 <?php
-require_once 'CsvReader.php';
-set_time_limit(2500); // 2 Minuten erlauben
+
+require_once 'CSVReader.php';
+require_once 'PDFReader.php';
+require_once 'PromptBuilder.php';
+
+
+// Allow the script to run for up to 2500 seconds (~41 minutes)
+set_time_limit(2500);
+
 
 
 header('Content-Type: application/json');
 header('Content-Type: application/json');
 
 
+
 // only accept POST requests
 // only accept POST requests
 if ($_SERVER['REQUEST_METHOD'] !== 'POST') {
 if ($_SERVER['REQUEST_METHOD'] !== 'POST') {
     http_response_code(405);
     http_response_code(405);
@@ -12,75 +20,52 @@ if ($_SERVER['REQUEST_METHOD'] !== 'POST') {
 }
 }
 
 
 
 
-/** Extract text from a PDF file using pdftotext command line tool.
- *
- * @param string $file Path to the PDF file.
- * @return string Extracted text from the PDF.
- */
-function extractPdfText($file) {
-    $tmp = tempnam(sys_get_temp_dir(), 'pdftext');
-    exec("pdftotext " . escapeshellarg($file) . " " . escapeshellarg($tmp));
-    $text = file_get_contents($tmp);
-    unlink($tmp);
-    return $text ?: '';
-}
-
 // Build prompt for deepseek
 // Build prompt for deepseek
 $input = json_decode(file_get_contents('php://input'), true);
 $input = json_decode(file_get_contents('php://input'), true);
 $question = isset($input['question']) ? $input['question'] : '';
 $question = isset($input['question']) ? $input['question'] : '';
 $roleFile = __DIR__ . '/data/role.txt';
 $roleFile = __DIR__ . '/data/role.txt';
 $role = file_exists($roleFile) ? file_get_contents($roleFile) : '';
 $role = file_exists($roleFile) ? file_get_contents($roleFile) : '';
 
 
-$reader = new CsvReader(__DIR__ . '/data/csv');
-$rows = $reader->readAll();
-$contextText = "";
-foreach ($rows as $row) {
-    $contextText .= "- " . implode(" | ", $row) . "\n";
-}
 
 
-$pdf1 = extractPdfText(__DIR__ . '/data/pdf/onboarding.pdf');
-$pdf2 = extractPdfText(__DIR__ . '/data/pdf/urlaub.pdf');
+// Reading CSV data
+$csvReader = new CSVReader(__DIR__ . '/data/csv');
+$csvRows    = $csvReader->readAll();
+$contextText = $csvReader->toString($csvRows);
+
 
 
-$pdfContext = "\n\nAus Onboarding-Dokument:\n" . $pdf1 . "\n\nAus Urlaubsdokument:\n" . $pdf2;
+// Reading PDF data
+$pdfReader = new PDFReader(__DIR__ . '/data/pdf');
+$pdfData = $pdfReader->readAll();
+$pdfContext = $pdfReader->toString($pdfData);
 
 
-$htmlDir = __DIR__ . '/data/html';
+
+// TODO: Read HTML data
 $htmlContext = '';
 $htmlContext = '';
 
 
-$prompt = "";
-$prompt = "Deine Rolle ist: ". $role;
-$prompt .= "\nDeine Frage ist: " . $question;
-$prompt .= "\nCSV-Daten:\n" . $contextText;
-$prompt .= "\nPDF-Kontext:\n" . $pdfContext;
-$prompt .= "\nHTML-Wissensdatenbank:\n" . $htmlContext;
-
-// ollama stuff
-//$payload = json_encode([
-//    "model" => "deepseek-r1",
-////    "model" => "deepseek-llm",
-////    "model" => "deepseek-coder:6.7b",
-//    "prompt" => $prompt,
-//    "stream" => false,
-//    "stop" => ["</think>"]
-//]);
-//$promptContext = "Kontextdaten:\n\nCSV:\n$contextText\n\nPDF:\n$pdfContext\nHTML:\n$htmlContext";
-$promptContext = "Kontextdaten:\n\nCSV:\n$contextText\n\nPDF:\n$pdfContext";
-error_log("System prompt length: " . strlen($promptContext));
-
-$payload = json_encode([
-    "model" => "deepseek-llm-7b-chat",
-    "messages" => [
-//        ["role" => "system", "content" => "Rolle:\n$role\n\n$promptContext"],
-        ["role" => "system", "content" => "Rolle:\n$role"],
-        ["role" => "user", "content" => $question]
-//        ["role" => "context", "content" => $promptContext]
-    ],
-    "temperature" => 0.5,
-    "max_tokens" => -1,
-    "stream" => false
-]);
+
+// Assemble the prompt from the role, the question and all collected context sources
+$prompt = new PromptBuilder();
+$prompt->setRole($role);
+$prompt->setQuestion($question);
+
+
+$promptContext = "\nCSV-Kontext:\n" . $contextText;
+$promptContext .= "\nPDF-Kontext:\n" . $pdfContext;
+$promptContext .= "\nHTML-Wissensdatenbank:\n" . $htmlContext;
+
+
+// Hand over the combined context; passing only $contextText would drop the PDF text
+$prompt->setContext($promptContext);
+
+
+// The plain-text prompt is only used to log its size
+$fullPrompt = $prompt->buildPromptString();
+
+
+error_log("System prompt length: " . strlen($fullPrompt));
+
+
+// JSON request body that is sent to the chat-completions endpoint
+$payload = $prompt->buildPromptData();
+
+//Starting Prompt Generation
 $start = microtime(true);
 $start = microtime(true);
-// ### Send Request
-//$ch = curl_init('http://localhost:11434/api/generate');
 $ch = curl_init('http://localhost:1234/api/v0/chat/completions');
 $ch = curl_init('http://localhost:1234/api/v0/chat/completions');
 curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
 curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
 curl_setopt($ch, CURLOPT_POST, true);
 curl_setopt($ch, CURLOPT_POST, true);
@@ -91,15 +76,11 @@ curl_close($ch);
 $data = json_decode($response, true);
 $data = json_decode($response, true);
 $end = microtime(true);
 $end = microtime(true);
 
 
-// ### Prepare Response
-//$responseText = isset($data['response']) ? $data['response'] : 'No answer from Deepseek!';
-//$responseText = $data['choices'][0]['message']['content'] ?? 'No answer from Deepseek!';
+
 $responseText = isset($data['choices'][0]['message']['content']) ? $data['choices'][0]['message']['content'] : '';
 $responseText = isset($data['choices'][0]['message']['content']) ? $data['choices'][0]['message']['content'] : '';
-//$responseText = preg_replace('/<think>.*?<\/think>/s', ' ', $responseText);
 $responseText = trim($responseText);
 $responseText = trim($responseText);
 
 
 echo json_encode([
 echo json_encode([
     'reply' => $responseText,
     'reply' => $responseText,
-//    'misc' => $htmlContext,
     'duration_seconds' => round($end - $start, 3)
     'duration_seconds' => round($end - $start, 3)
 ]);
 ]);