Browse Source

Documented Code, Removed useless getEntries.js

eliasCybob 5 months ago
parent
commit
a11c66a893
3 changed files with 29 additions and 41 deletions
  1. 21 0
      api/CsvReader.php
  2. 8 1
      api/chatbot.php
  3. 0 40
      scraper/getEntries.js

+ 21 - 0
api/CsvReader.php

@@ -1,11 +1,32 @@
 <?php
+
+
+/**
+ * CsvReader class to read CSV files from a specified directory.
+ * It reads all CSV files, combines their headers, and returns an array of rows.
+ */
 class CsvReader {
+
+    /**
+     * Directory where CSV files are stored.
+     * @var string
+     */
     private $dir;
 
+    /**
+     * Constructor to initialize the CsvReader with a directory.
+     *
+     * @param string $directory The directory containing CSV files.
+     */
     public function __construct($directory) {
         $this->dir = rtrim($directory, '/') . '/';
     }
 
+    /**
+     * Reads all CSV files in the specified directory and returns an array of rows.
+     *
+     * @return array An array of associative arrays representing the rows in the CSV files.
+     */
     public function readAll() {
         $rows = array();
 

+ 8 - 1
api/chatbot.php

@@ -11,6 +11,12 @@ if ($_SERVER['REQUEST_METHOD'] !== 'POST') {
     exit;
 }
 
+
+/** Extracts text from a PDF file using the pdftotext command-line tool.
+ *
+ * @param string $file Path to the PDF file.
+ * @return string Extracted text from the PDF.
+ */
 function extractPdfText($file) {
     $tmp = tempnam(sys_get_temp_dir(), 'pdftext');
     exec("pdftotext " . escapeshellarg($file) . " " . escapeshellarg($tmp));
@@ -19,7 +25,7 @@ function extractPdfText($file) {
     return $text ?: '';
 }
 
-// ### Build prompt for deepseek
+// Build prompt for deepseek
 $input = json_decode(file_get_contents('php://input'), true);
 $question = isset($input['question']) ? $input['question'] : '';
 $roleFile = __DIR__ . '/data/role.txt';
@@ -31,6 +37,7 @@ $contextText = "";
 foreach ($rows as $row) {
     $contextText .= "- " . implode(" | ", $row) . "\n";
 }
+
 $pdf1 = extractPdfText(__DIR__ . '/data/pdf/onboarding.pdf');
 $pdf2 = extractPdfText(__DIR__ . '/data/pdf/urlaub.pdf');
 

+ 0 - 40
scraper/getEntries.js

@@ -1,40 +0,0 @@
-const fs = require('fs');
-const path = require('path');
-
-const IN_FILE = 'knowledgebase.html';
-const OUT_DIR = './entries';
-
-// read file
-const html = fs.readFileSync(IN_FILE, 'utf8');
-
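-// Match each <tr class="cb_pointer"> row; capture its leading block (group 1) and the numeric id from <span name="..."> (group 2)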
-const entryRegex = /<tr class="cb_pointer">([\s\S]*?<span name="(\d+)"[\s\S]*?<\/td>)[\s\S]*?<\/tr>/g;
-const entries = {};
-let match;
-
-while ((match = entryRegex.exec(html)) !== null) {
-    const id = match[2];
-    const block = match[1].trim();
-    entries[id] = block;
-}
-
-if (!fs.existsSync(OUT_DIR)) fs.mkdirSync(OUT_DIR);
-
-function decodeHtmlEntities(str) {
-    return str
-        // .replace(/&lt;/g, "<")
-        // .replace(/&gt;/g, ">")
-        // .replace(/&quot;/g, "\"")
-        // .replace(/&amp;/g, "&")
-        // .replace(/<(?!br\s*\/?)[^>]+>/gi, '');
-}
-
-for (const [id, content] of Object.entries(entries)) {
-    const filePath = path.join(OUT_DIR, `${id}.txt`);
-    // fs.writeFileSync(filePath, content, 'utf8');
-    // console.log(`📄 Gespeichert: ${filePath}`);
-}
-
-console.log(decodeHtmlEntities(entries[25]).trim());
-
-console.log(`✅ ${Object.keys(entries).length} Einträge gespeichert.`);