<?php
// PDFReader.php
  /**
   * Reads PDF files from a directory and extracts their text with the
   * `pdftotext` command line tool.
   *
   * Each file is represented as one row of the form
   * ['filename' => string, 'text' => string].
   */
  class PDFReader
  {
      /**
       * Directory where PDF files are stored, always ending in a single '/'.
       *
       * @var string
       */
      private $dir;

      /**
       * Initializes the reader with the directory to read from.
       *
       * Trailing slashes are stripped and exactly one is re-appended, so the
       * stored value can be used directly as a path prefix.
       *
       * @param string $directory The directory containing PDF files.
       */
      public function __construct($directory)
      {
          $this->dir = rtrim($directory, '/') . '/';
      }

      /**
       * Reads every `*.pdf` file in the directory and extracts its text.
       *
       * @return array A list of rows, each with the keys 'filename' (base name
       *               of the file) and 'text' (the extracted text). Empty when
       *               no file matches or the directory cannot be listed.
       */
      public function readAll()
      {
          // glob() returns false on error (for example under open_basedir
          // restrictions); treat that as "no files" instead of iterating
          // over a boolean.
          $files = glob($this->dir . '*.pdf');
          if ($files === false) {
              $files = [];
          }

          $rows = [];
          foreach ($files as $file) {
              $rows[] = [
                  'filename' => basename($file),
                  'text' => $this->readFile($file),
              ];
          }

          return $rows;
      }

      /**
       * Extracts the text of a single PDF file using `pdftotext`.
       *
       * A path that does not already start with the configured directory is
       * resolved relative to it (leading slashes are stripped first).
       *
       * @param string $file Path to the PDF file, with or without the
       *                     configured directory prefix.
       * @return string The extracted text, or '' when nothing could be read.
       */
      public function readFile($file)
      {
          $filePath = strpos($file, $this->dir) === 0
              ? $file
              : $this->dir . ltrim($file, '/');

          // tempnam() returns false when no temporary file can be created;
          // without a target file there is nothing to extract into.
          $tmp = tempnam(sys_get_temp_dir(), 'pdftext');
          if ($tmp === false) {
              return '';
          }

          // Both paths are shell-escaped; pdftotext writes its output to $tmp.
          exec('pdftotext ' . escapeshellarg($filePath) . ' ' . escapeshellarg($tmp));
          $text = file_get_contents($tmp);
          unlink($tmp);

          // Only a read failure maps to ''; a truthiness check would also
          // discard legitimate content such as the string "0".
          return $text === false ? '' : $text;
      }

      /**
       * Renders rows as a plain-text listing.
       *
       * Each row becomes "- <filename>:", a newline, the trimmed text, and a
       * blank line.
       *
       * @param array $pdfRows Rows with the keys 'filename' and 'text', as
       *                       returned by readAll().
       * @return string The concatenated listing, or '' for no rows.
       */
      public function toString($pdfRows)
      {
          $pdfString = '';
          foreach ($pdfRows as $pdfRow) {
              $pdfString .= '- ' . $pdfRow['filename'] . ":\n" . trim($pdfRow['text']) . "\n\n";
          }

          return $pdfString;
      }
  }