Browse Source

DeepSeek-llm:chat

daniel 7 months ago
parent
commit
380eb5d160
4 changed files with 357 additions and 9 deletions
  1. 38 7
      api/chatbot.php
  2. 141 0
      api/data/html/knowledgebase.html
  3. 37 2
      scraper/getEntries.js
  4. 141 0
      scraper/knowledgebase.html

+ 38 - 7
api/chatbot.php

@@ -1,5 +1,6 @@
 <?php
 <?php
 require_once 'CsvReader.php';
 require_once 'CsvReader.php';
+set_time_limit(2500); // allow up to 2500 seconds (~41 minutes) of execution time, not 2 minutes
 
 
 header('Content-Type: application/json');
 header('Content-Type: application/json');
 
 
@@ -35,19 +36,45 @@ $pdf2 = extractPdfText(__DIR__ . '/data/pdf/urlaub.pdf');
 
 
 $pdfContext = "\n\nAus Onboarding-Dokument:\n" . $pdf1 . "\n\nAus Urlaubsdokument:\n" . $pdf2;
 $pdfContext = "\n\nAus Onboarding-Dokument:\n" . $pdf1 . "\n\nAus Urlaubsdokument:\n" . $pdf2;
 
 
+$htmlDir = __DIR__ . '/data/html';
+$htmlContext = '';
+
+$prompt = "";
 $prompt = "Deine Rolle ist: ". $role;
 $prompt = "Deine Rolle ist: ". $role;
 $prompt .= "\nDeine Frage ist: " . $question;
 $prompt .= "\nDeine Frage ist: " . $question;
 $prompt .= "\nCSV-Daten:\n" . $contextText;
 $prompt .= "\nCSV-Daten:\n" . $contextText;
 $prompt .= "\nPDF-Kontext:\n" . $pdfContext;
 $prompt .= "\nPDF-Kontext:\n" . $pdfContext;
+$prompt .= "\nHTML-Wissensdatenbank:\n" . $htmlContext;
+
+// Previous Ollama-style payload (single "prompt" field, sent to /api/generate), kept commented out for reference:
+//$payload = json_encode([
+//    "model" => "deepseek-r1",
+////    "model" => "deepseek-llm",
+////    "model" => "deepseek-coder:6.7b",
+//    "prompt" => $prompt,
+//    "stream" => false,
+//    "stop" => ["</think>"]
+//]);
+//$promptContext = "Kontextdaten:\n\nCSV:\n$contextText\n\nPDF:\n$pdfContext\nHTML:\n$htmlContext";
+$promptContext = "Kontextdaten:\n\nCSV:\n$contextText\n\nPDF:\n$pdfContext";
+error_log("System prompt length: " . strlen($promptContext));
 
 
 $payload = json_encode([
 $payload = json_encode([
-    "model" => "deepseek-r1",
-    "prompt" => $prompt,
+    "model" => "deepseek-llm-7b-chat",
+    "messages" => [
+//        ["role" => "system", "content" => "Rolle:\n$role\n\n$promptContext"],
+        ["role" => "system", "content" => "Rolle:\n$role"],
+        ["role" => "user", "content" => $question]
+//        ["role" => "context", "content" => $promptContext]
+    ],
+    "temperature" => 0.5,
+    "max_tokens" => -1,
     "stream" => false
     "stream" => false
 ]);
 ]);
-
+$start = microtime(true);
 // ### Send Request
 // ### Send Request
-$ch = curl_init('http://localhost:11434/api/generate');
+//$ch = curl_init('http://localhost:11434/api/generate');
+$ch = curl_init('http://localhost:1234/api/v0/chat/completions');
 curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
 curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
 curl_setopt($ch, CURLOPT_POST, true);
 curl_setopt($ch, CURLOPT_POST, true);
 curl_setopt($ch, CURLOPT_HTTPHEADER, ['Content-Type: application/json']);
 curl_setopt($ch, CURLOPT_HTTPHEADER, ['Content-Type: application/json']);
@@ -55,13 +82,17 @@ curl_setopt($ch, CURLOPT_POSTFIELDS, $payload);
 $response = curl_exec($ch);
 $response = curl_exec($ch);
 curl_close($ch);
 curl_close($ch);
 $data = json_decode($response, true);
 $data = json_decode($response, true);
+$end = microtime(true);
 
 
 // ### Prepare Response
 // ### Prepare Response
-$responseText = isset($data['response']) ? $data['response'] : 'No answer from Deepseek!';
-$responseText = preg_replace('/<think>.*?<\/think>/s', ' ', $responseText);
+//$responseText = isset($data['response']) ? $data['response'] : 'No answer from Deepseek!';
+//$responseText = $data['choices'][0]['message']['content'] ?? 'No answer from Deepseek!';
+$responseText = isset($data['choices'][0]['message']['content']) ? $data['choices'][0]['message']['content'] : '';
+//$responseText = preg_replace('/<think>.*?<\/think>/s', ' ', $responseText);
 $responseText = trim($responseText);
 $responseText = trim($responseText);
 
 
 echo json_encode([
 echo json_encode([
     'reply' => $responseText,
     'reply' => $responseText,
-    'misc' => $pdfContext
+//    'misc' => $htmlContext,
+    'duration_seconds' => round($end - $start, 3)
 ]);
 ]);

File diff suppressed because it is too large
+ 141 - 0
api/data/html/knowledgebase.html


+ 37 - 2
scraper/getEntries.js

@@ -1,5 +1,40 @@
-const puppeteer = require('puppeteer');
 const fs = require('fs');
 const fs = require('fs');
 const path = require('path');
 const path = require('path');
 
 
-const BASE_URL = 'https://projects.cybob.com/ajax/&am=Knowledgebase.showEntryList&search=&tags=undefined';
+const IN_FILE = 'knowledgebase.html';
+const OUT_DIR = './entries';
+
+// read file
+const html = fs.readFileSync(IN_FILE, 'utf8');
+
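+// Match each <tr class="cb_pointer"> row: group 1 = row content up to the first </td> after the <span name="ID">, group 2 = the numeric ID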
+const entryRegex = /<tr class="cb_pointer">([\s\S]*?<span name="(\d+)"[\s\S]*?<\/td>)[\s\S]*?<\/tr>/g;
+const entries = {};
+let match;
+
+while ((match = entryRegex.exec(html)) !== null) {
+    const id = match[2];
+    const block = match[1].trim();
+    entries[id] = block;
+}
+
+if (!fs.existsSync(OUT_DIR)) fs.mkdirSync(OUT_DIR);
+
+function decodeHtmlEntities(str) {
+    return str
+        // .replace(/&lt;/g, "<")
+        // .replace(/&gt;/g, ">")
+        // .replace(/&quot;/g, "\"")
+        // .replace(/&amp;/g, "&")
+        // .replace(/<(?!br\s*\/?)[^>]+>/gi, '');
+}
+
+for (const [id, content] of Object.entries(entries)) {
+    const filePath = path.join(OUT_DIR, `${id}.txt`);
+    // fs.writeFileSync(filePath, content, 'utf8');
+    // console.log(`📄 Gespeichert: ${filePath}`);
+}
+
+console.log(decodeHtmlEntities(entries[25]).trim());
+
+console.log(`✅ ${Object.keys(entries).length} Einträge gespeichert.`);

File diff suppressed because it is too large
+ 141 - 0
scraper/knowledgebase.html


Some files were not shown because too many files changed in this diff