dir = rtrim($directory, '/') . '/';
    }

    /**
     * Reads every "*.pdf" file directly inside the configured directory and
     * extracts its text.
     *
     * @return array A list of associative arrays, each with the keys
     *               'filename' (base name of the PDF) and 'text' (extracted text).
     */
    public function readAll()
    {
        // glob() returns false on error; fall back to an empty list so the
        // loop below never iterates over a non-array value.
        $files = glob($this->dir . '*.pdf') ?: [];

        $rows = [];
        foreach ($files as $file) {
            $rows[] = [
                'filename' => basename($file),
                'text' => $this->readFile($file),
            ];
        }

        return $rows;
    }

    /**
     * Extracts the text of a PDF file using the pdftotext command line tool.
     *
     * The text is written by pdftotext to a temporary file, read back, and the
     * temporary file is removed again before returning.
     *
     * @param string $file Path to the PDF file. A path that does not already
     *                     start with the configured directory is resolved
     *                     relative to that directory.
     * @return string Extracted text, or an empty string when the temporary file
     *                cannot be created, pdftotext exits with a non-zero status,
     *                or its output cannot be read.
     */
    public function readFile($file)
    {
        $filePath = strpos($file, $this->dir) === 0
            ? $file
            : $this->dir . ltrim($file, '/');

        $tmp = tempnam(sys_get_temp_dir(), 'pdftext');
        if ($tmp === false) {
            // No temporary file available: nothing to hand to pdftotext.
            return '';
        }

        $output = [];
        $exitCode = 1;
        exec(
            'pdftotext ' . escapeshellarg($filePath) . ' ' . escapeshellarg($tmp),
            $output,
            $exitCode
        );

        // Only trust the output file when pdftotext reported success.
        $text = ($exitCode === 0 && is_readable($tmp))
            ? file_get_contents($tmp)
            : false;

        // Always clean up the temporary file, whether or not extraction worked.
        if (is_file($tmp)) {
            unlink($tmp);
        }

        // Strict check so that a legitimate result such as '0' is not discarded.
        return $text === false ? '' : $text;
    }

    /**
     * Renders PDF rows as a single plain-text string.
     *
     * Each row becomes "- <filename>:" followed by the trimmed text on the next
     * line and one blank line.
     *
     * @param array $pdfRows A list of associative arrays with the keys
     *                       'filename' and 'text'.
     * @return string The concatenated representation; empty for an empty list.
     */
    public function toString($pdfRows)
    {
        $pdfString = '';
        foreach ($pdfRows as $pdfRow) {
            $pdfString .= '- ' . $pdfRow['filename'] . ":\n" . trim($pdfRow['text']) . "\n\n";
        }

        return $pdfString;
    }
}