PdfParser.php 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381
  1. <?php
  2. /**
  3. * This file is part of FPDI
  4. *
  5. * @package Fpdi
  6. * @copyright Copyright (c) 2020 Setasign GmbH & Co. KG (https://www.setasign.com)
  7. * @license http://opensource.org/licenses/mit-license The MIT License
  8. */
  9. namespace Fpdi\PdfParser;
  10. use Fpdi\PdfParser\CrossReference\CrossReference;
  11. use Fpdi\PdfParser\CrossReference\CrossReferenceException;
  12. use Fpdi\PdfParser\Type\PdfArray;
  13. use Fpdi\PdfParser\Type\PdfBoolean;
  14. use Fpdi\PdfParser\Type\PdfDictionary;
  15. use Fpdi\PdfParser\Type\PdfHexString;
  16. use Fpdi\PdfParser\Type\PdfIndirectObject;
  17. use Fpdi\PdfParser\Type\PdfIndirectObjectReference;
  18. use Fpdi\PdfParser\Type\PdfName;
  19. use Fpdi\PdfParser\Type\PdfNull;
  20. use Fpdi\PdfParser\Type\PdfNumeric;
  21. use Fpdi\PdfParser\Type\PdfStream;
  22. use Fpdi\PdfParser\Type\PdfString;
  23. use Fpdi\PdfParser\Type\PdfToken;
  24. use Fpdi\PdfParser\Type\PdfType;
  25. /**
  26. * A PDF parser class
  27. */
  28. class PdfParser
  29. {
  30. /**
  31. * @var StreamReader
  32. */
  33. protected $streamReader;
  34. /**
  35. * @var Tokenizer
  36. */
  37. protected $tokenizer;
  38. /**
  39. * The file header.
  40. *
  41. * @var string
  42. */
  43. protected $fileHeader;
  44. /**
  45. * The offset to the file header.
  46. *
  47. * @var int
  48. */
  49. protected $fileHeaderOffset;
  50. /**
  51. * @var CrossReference|null
  52. */
  53. protected $xref;
  54. /**
  55. * All read objects.
  56. *
  57. * @var array
  58. */
  59. protected $objects = [];
  60. /**
  61. * PdfParser constructor.
  62. *
  63. * @param StreamReader $streamReader
  64. */
  65. public function __construct(StreamReader $streamReader)
  66. {
  67. $this->streamReader = $streamReader;
  68. $this->tokenizer = new Tokenizer($streamReader);
  69. }
  70. /**
  71. * Removes cycled references.
  72. *
  73. * @internal
  74. */
  75. public function cleanUp()
  76. {
  77. $this->xref = null;
  78. }
  79. /**
  80. * Get the stream reader instance.
  81. *
  82. * @return StreamReader
  83. */
  84. public function getStreamReader()
  85. {
  86. return $this->streamReader;
  87. }
  88. /**
  89. * Get the tokenizer instance.
  90. *
  91. * @return Tokenizer
  92. */
  93. public function getTokenizer()
  94. {
  95. return $this->tokenizer;
  96. }
  97. /**
  98. * Resolves the file header.
  99. *
  100. * @throws PdfParserException
  101. * @return int
  102. */
  103. protected function resolveFileHeader()
  104. {
  105. if ($this->fileHeader) {
  106. return $this->fileHeaderOffset;
  107. }
  108. $this->streamReader->reset(0);
  109. $maxIterations = 1000;
  110. while (true) {
  111. $buffer = $this->streamReader->getBuffer(false);
  112. $offset = \strpos($buffer, '%PDF-');
  113. if ($offset === false) {
  114. if (!$this->streamReader->increaseLength(100) || (--$maxIterations === 0)) {
  115. throw new PdfParserException(
  116. 'Unable to find PDF file header.',
  117. PdfParserException::FILE_HEADER_NOT_FOUND
  118. );
  119. }
  120. continue;
  121. }
  122. break;
  123. }
  124. $this->fileHeaderOffset = $offset;
  125. $this->streamReader->setOffset($offset);
  126. $this->fileHeader = \trim($this->streamReader->readLine());
  127. return $this->fileHeaderOffset;
  128. }
  129. /**
  130. * Get the cross reference instance.
  131. *
  132. * @return CrossReference
  133. * @throws CrossReferenceException
  134. * @throws PdfParserException
  135. */
  136. public function getCrossReference()
  137. {
  138. if ($this->xref === null) {
  139. $this->xref = new CrossReference($this, $this->resolveFileHeader());
  140. }
  141. return $this->xref;
  142. }
  143. /**
  144. * Get the PDF version.
  145. *
  146. * @return int[] An array of major and minor version.
  147. * @throws PdfParserException
  148. */
  149. public function getPdfVersion()
  150. {
  151. $this->resolveFileHeader();
  152. if (\preg_match('/%PDF-(\d)\.(\d)/', $this->fileHeader, $result) === 0) {
  153. throw new PdfParserException(
  154. 'Unable to extract PDF version from file header.',
  155. PdfParserException::PDF_VERSION_NOT_FOUND
  156. );
  157. }
  158. list(, $major, $minor) = $result;
  159. $catalog = $this->getCatalog();
  160. if (isset($catalog->value['Version'])) {
  161. $versionParts = \explode(
  162. '.',
  163. PdfName::unescape(PdfType::resolve($catalog->value['Version'], $this)->value)
  164. );
  165. if (count($versionParts) === 2) {
  166. list($major, $minor) = $versionParts;
  167. }
  168. }
  169. return [(int) $major, (int) $minor];
  170. }
  171. /**
  172. * Get the catalog dictionary.
  173. *
  174. * @return PdfDictionary
  175. * @throws Type\PdfTypeException
  176. * @throws CrossReferenceException
  177. * @throws PdfParserException
  178. */
  179. public function getCatalog()
  180. {
  181. $trailer = $this->getCrossReference()->getTrailer();
  182. $catalog = PdfType::resolve(PdfDictionary::get($trailer, 'Root'), $this);
  183. return PdfDictionary::ensure($catalog);
  184. }
  185. /**
  186. * Get an indirect object by its object number.
  187. *
  188. * @param int $objectNumber
  189. * @param bool $cache
  190. * @return PdfIndirectObject
  191. * @throws CrossReferenceException
  192. * @throws PdfParserException
  193. */
  194. public function getIndirectObject($objectNumber, $cache = false)
  195. {
  196. $objectNumber = (int) $objectNumber;
  197. if (isset($this->objects[$objectNumber])) {
  198. return $this->objects[$objectNumber];
  199. }
  200. $object = $this->getCrossReference()->getIndirectObject($objectNumber);
  201. if ($cache) {
  202. $this->objects[$objectNumber] = $object;
  203. }
  204. return $object;
  205. }
  206. /**
  207. * Read a PDF value.
  208. *
  209. * @param null|bool|string $token
  210. * @param null|string $expectedType
  211. * @return false|PdfArray|PdfBoolean|PdfDictionary|PdfHexString|PdfIndirectObject|PdfIndirectObjectReference|PdfName|PdfNull|PdfNumeric|PdfStream|PdfString|PdfToken
  212. * @throws Type\PdfTypeException
  213. */
  214. public function readValue($token = null, $expectedType = null)
  215. {
  216. if ($token === null) {
  217. $token = $this->tokenizer->getNextToken();
  218. }
  219. if ($token === false) {
  220. if ($expectedType !== null) {
  221. throw new Type\PdfTypeException('Got unexpected token type.', Type\PdfTypeException::INVALID_DATA_TYPE);
  222. }
  223. return false;
  224. }
  225. switch ($token) {
  226. case '(':
  227. $this->ensureExpectedType($token, $expectedType);
  228. return PdfString::parse($this->streamReader);
  229. case '<':
  230. if ($this->streamReader->getByte() === '<') {
  231. $this->ensureExpectedType('<<', $expectedType);
  232. $this->streamReader->addOffset(1);
  233. return PdfDictionary::parse($this->tokenizer, $this->streamReader, $this);
  234. }
  235. $this->ensureExpectedType($token, $expectedType);
  236. return PdfHexString::parse($this->streamReader);
  237. case '/':
  238. $this->ensureExpectedType($token, $expectedType);
  239. return PdfName::parse($this->tokenizer, $this->streamReader);
  240. case '[':
  241. $this->ensureExpectedType($token, $expectedType);
  242. return PdfArray::parse($this->tokenizer, $this);
  243. default:
  244. if (\is_numeric($token)) {
  245. if (($token2 = $this->tokenizer->getNextToken()) !== false) {
  246. if (\is_numeric($token2) && ($token3 = $this->tokenizer->getNextToken()) !== false) {
  247. switch ($token3) {
  248. case 'obj':
  249. if ($expectedType !== null && $expectedType !== PdfIndirectObject::class) {
  250. throw new Type\PdfTypeException(
  251. 'Got unexpected token type.',
  252. Type\PdfTypeException::INVALID_DATA_TYPE
  253. );
  254. }
  255. return PdfIndirectObject::parse(
  256. (int) $token,
  257. (int) $token2,
  258. $this,
  259. $this->tokenizer,
  260. $this->streamReader
  261. );
  262. case 'R':
  263. if (
  264. $expectedType !== null &&
  265. $expectedType !== PdfIndirectObjectReference::class
  266. ) {
  267. throw new Type\PdfTypeException(
  268. 'Got unexpected token type.',
  269. Type\PdfTypeException::INVALID_DATA_TYPE
  270. );
  271. }
  272. return PdfIndirectObjectReference::create((int) $token, (int) $token2);
  273. }
  274. $this->tokenizer->pushStack($token3);
  275. }
  276. $this->tokenizer->pushStack($token2);
  277. }
  278. if ($expectedType !== null && $expectedType !== PdfNumeric::class) {
  279. throw new Type\PdfTypeException(
  280. 'Got unexpected token type.',
  281. Type\PdfTypeException::INVALID_DATA_TYPE
  282. );
  283. }
  284. return PdfNumeric::create($token + 0);
  285. }
  286. if ($token === 'true' || $token === 'false') {
  287. $this->ensureExpectedType($token, $expectedType);
  288. return PdfBoolean::create($token === 'true');
  289. }
  290. if ($token === 'null') {
  291. $this->ensureExpectedType($token, $expectedType);
  292. return new PdfNull();
  293. }
  294. if ($expectedType !== null && $expectedType !== PdfToken::class) {
  295. throw new Type\PdfTypeException(
  296. 'Got unexpected token type.',
  297. Type\PdfTypeException::INVALID_DATA_TYPE
  298. );
  299. }
  300. $v = new PdfToken();
  301. $v->value = $token;
  302. return $v;
  303. }
  304. }
  305. /**
  306. * Ensures that the token will evaluate to an expected object type (or not).
  307. *
  308. * @param string $token
  309. * @param string|null $expectedType
  310. * @return bool
  311. * @throws Type\PdfTypeException
  312. */
  313. private function ensureExpectedType($token, $expectedType)
  314. {
  315. static $mapping = [
  316. '(' => PdfString::class,
  317. '<' => PdfHexString::class,
  318. '<<' => PdfDictionary::class,
  319. '/' => PdfName::class,
  320. '[' => PdfArray::class,
  321. 'true' => PdfBoolean::class,
  322. 'false' => PdfBoolean::class,
  323. 'null' => PdfNull::class
  324. ];
  325. if ($expectedType === null || $mapping[$token] === $expectedType) {
  326. return true;
  327. }
  328. throw new Type\PdfTypeException('Got unexpected token type.', Type\PdfTypeException::INVALID_DATA_TYPE);
  329. }
  330. }