| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768 |
- <?php
/**
 * PDFReader reads the PDF files stored in a given directory.
 * It extracts the text of every PDF file and returns an array of rows (one per file).
 */
class PDFReader {
    /**
     * Directory where PDF files are stored, normalized to end in exactly one '/'.
     * @var string
     */
    private $dir;

    /**
     * Constructor to initialize the PDFReader with a directory.
     *
     * @param string $directory The directory containing PDF files (with or without trailing '/').
     */
    public function __construct($directory) {
        // Strip any trailing slashes and add one back so paths can be built by plain concatenation.
        $this->dir = rtrim($directory, '/') . '/';
    }

    /**
     * Reads all "*.pdf" files in the configured directory and extracts their text.
     *
     * @return array A list of associative arrays with the keys 'filename' (base name
     *               of the PDF) and 'text' (extracted text, '' if extraction failed).
     */
    public function readAll() {
        $files = glob($this->dir . '*.pdf');
        if ($files === false) {
            // glob() returns false on error; treat that the same as "no files found".
            $files = [];
        }

        $rows = [];
        foreach ($files as $file) {
            $rows[] = [
                'filename' => basename($file),
                'text' => $this->readFile($file),
            ];
        }
        return $rows;
    }

    /**
     * Extract text from a PDF file using the pdftotext command line tool.
     *
     * A path that does not already start with the configured directory is
     * resolved relative to that directory (leading slashes are dropped).
     *
     * @param string $file Path to the PDF file, either inside the configured
     *                     directory or relative to it.
     * @return string Extracted text from the PDF, or '' when the file does not
     *                exist or no text could be read.
     */
    public function readFile($file) {
        $filePath = strpos($file, $this->dir) === 0 ? $file : $this->dir . ltrim($file, '/');

        // Nothing to convert: skip spawning pdftotext for a missing file.
        if (!is_file($filePath)) {
            return '';
        }

        // pdftotext writes into a temporary file that is removed again below.
        $tmp = tempnam(sys_get_temp_dir(), 'pdftext');
        if ($tmp === false) {
            return '';
        }

        exec('pdftotext ' . escapeshellarg($filePath) . ' ' . escapeshellarg($tmp));
        $text = file_get_contents($tmp);
        unlink($tmp);

        // Compare against false explicitly: a truthiness test would turn the
        // valid extracted text "0" into an empty string.
        return $text === false ? '' : $text;
    }

    /**
     * Converts an array of PDF rows into a string format.
     *
     * Each row is rendered as "- <filename>:\n<trimmed text>\n\n".
     *
     * @param array $pdfRows A list of associative arrays with the keys 'filename'
     *                       and 'text', as returned by readAll().
     * @return string The concatenated representation of all rows ('' for no rows).
     */
    public function toString($pdfRows) {
        $pdfString = '';
        foreach ($pdfRows as $pdfRow) {
            // A missing key is rendered as an empty string instead of raising a warning.
            $filename = isset($pdfRow['filename']) ? $pdfRow['filename'] : '';
            $text = isset($pdfRow['text']) ? $pdfRow['text'] : '';
            $pdfString .= "- " . $filename . ":\n" . trim($text) . "\n\n";
        }
        return $pdfString;
    }
}
|