PdfStream.php 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
  1. <?php
  2. /**
  3. * This file is part of FPDI
  4. *
  5. * @package Fpdi
  6. * @copyright Copyright (c) 2020 Setasign GmbH & Co. KG (https://www.setasign.com)
  7. * @license http://opensource.org/licenses/mit-license The MIT License
  8. */
  9. namespace Fpdi\PdfParser\Type;
  10. use Fpdi\PdfParser\CrossReference\CrossReferenceException;
  11. use Fpdi\PdfParser\Filter\Ascii85;
  12. use Fpdi\PdfParser\Filter\AsciiHex;
  13. use Fpdi\PdfParser\Filter\FilterException;
  14. use Fpdi\PdfParser\Filter\Flate;
  15. use Fpdi\PdfParser\Filter\Lzw;
  16. use Fpdi\PdfParser\PdfParser;
  17. use Fpdi\PdfParser\PdfParserException;
  18. use Fpdi\PdfParser\StreamReader;
  19. use FpdiPdfParser\PdfParser\Filter\Predictor;
  20. /**
  21. * Class representing a PDF stream object
  22. */
  23. class PdfStream extends PdfType
  24. {
  25. /**
  26. * Parses a stream from a stream reader.
  27. *
  28. * @param PdfDictionary $dictionary
  29. * @param StreamReader $reader
  30. * @param PdfParser $parser Optional to keep backwards compatibility
  31. * @return self
  32. * @throws PdfTypeException
  33. */
  34. public static function parse(PdfDictionary $dictionary, StreamReader $reader, PdfParser $parser = null)
  35. {
  36. $v = new self();
  37. $v->value = $dictionary;
  38. $v->reader = $reader;
  39. $v->parser = $parser;
  40. $offset = $reader->getOffset();
  41. // Find the first "newline"
  42. while (($firstByte = $reader->getByte($offset)) !== false) {
  43. if ($firstByte !== "\n" && $firstByte !== "\r") {
  44. $offset++;
  45. } else {
  46. break;
  47. }
  48. }
  49. if ($firstByte === false) {
  50. throw new PdfTypeException(
  51. 'Unable to parse stream data. No newline after the stream keyword found.',
  52. PdfTypeException::NO_NEWLINE_AFTER_STREAM_KEYWORD
  53. );
  54. }
  55. $sndByte = $reader->getByte($offset + 1);
  56. if ($firstByte === "\n" || $firstByte === "\r") {
  57. $offset++;
  58. }
  59. if ($sndByte === "\n" && $firstByte !== "\n") {
  60. $offset++;
  61. }
  62. $reader->setOffset($offset);
  63. // let's only save the byte-offset and read the stream only when needed
  64. $v->stream = $reader->getPosition() + $reader->getOffset();
  65. return $v;
  66. }
  67. /**
  68. * Helper method to create an instance.
  69. *
  70. * @param PdfDictionary $dictionary
  71. * @param string $stream
  72. * @return self
  73. */
  74. public static function create(PdfDictionary $dictionary, $stream)
  75. {
  76. $v = new self();
  77. $v->value = $dictionary;
  78. $v->stream = (string) $stream;
  79. return $v;
  80. }
  81. /**
  82. * Ensures that the passed value is a PdfStream instance.
  83. *
  84. * @param mixed $stream
  85. * @return self
  86. * @throws PdfTypeException
  87. */
  88. public static function ensure($stream)
  89. {
  90. return PdfType::ensureType(self::class, $stream, 'Stream value expected.');
  91. }
  92. /**
  93. * The stream or its byte-offset position.
  94. *
  95. * @var int|string
  96. */
  97. protected $stream;
  98. /**
  99. * The stream reader instance.
  100. *
  101. * @var StreamReader|null
  102. */
  103. protected $reader;
  104. /**
  105. * The PDF parser instance.
  106. *
  107. * @var PdfParser
  108. */
  109. protected $parser;
  110. /**
  111. * Get the stream data.
  112. *
  113. * @param bool $cache Whether cache the stream data or not.
  114. * @return bool|string
  115. * @throws PdfTypeException
  116. * @throws CrossReferenceException
  117. * @throws PdfParserException
  118. */
  119. public function getStream($cache = false)
  120. {
  121. if (\is_int($this->stream)) {
  122. $length = PdfDictionary::get($this->value, 'Length');
  123. if ($this->parser !== null) {
  124. $length = PdfType::resolve($length, $this->parser);
  125. }
  126. if (!($length instanceof PdfNumeric) || $length->value === 0) {
  127. $this->reader->reset($this->stream, 100000);
  128. $buffer = $this->extractStream();
  129. } else {
  130. $this->reader->reset($this->stream, $length->value);
  131. $buffer = $this->reader->getBuffer(false);
  132. if ($this->parser !== null) {
  133. $this->reader->reset($this->stream + strlen($buffer));
  134. $this->parser->getTokenizer()->clearStack();
  135. $token = $this->parser->readValue();
  136. if ($token === false || !($token instanceof PdfToken) || $token->value !== 'endstream') {
  137. $this->reader->reset($this->stream, 100000);
  138. $buffer = $this->extractStream();
  139. $this->reader->reset($this->stream + strlen($buffer));
  140. }
  141. }
  142. }
  143. if ($cache === false) {
  144. return $buffer;
  145. }
  146. $this->stream = $buffer;
  147. $this->reader = null;
  148. }
  149. return $this->stream;
  150. }
  151. /**
  152. * Extract the stream "manually".
  153. *
  154. * @return string
  155. * @throws PdfTypeException
  156. */
  157. protected function extractStream()
  158. {
  159. while (true) {
  160. $buffer = $this->reader->getBuffer(false);
  161. $length = \strpos($buffer, 'endstream');
  162. if ($length === false) {
  163. if (!$this->reader->increaseLength(100000)) {
  164. throw new PdfTypeException('Cannot extract stream.');
  165. }
  166. continue;
  167. }
  168. break;
  169. }
  170. $buffer = \substr($buffer, 0, $length);
  171. $lastByte = \substr($buffer, -1);
  172. /* Check for EOL marker =
  173. * CARRIAGE RETURN (\r) and a LINE FEED (\n) or just a LINE FEED (\n},
  174. * and not by a CARRIAGE RETURN (\r) alone
  175. */
  176. if ($lastByte === "\n") {
  177. $buffer = \substr($buffer, 0, -1);
  178. $lastByte = \substr($buffer, -1);
  179. if ($lastByte === "\r") {
  180. $buffer = \substr($buffer, 0, -1);
  181. }
  182. }
  183. // There are streams in the wild, which have only white signs in them but need to be parsed manually due
  184. // to a problem encountered before (e.g. Length === 0). We should set them to empty streams to avoid problems
  185. // in further processing (e.g. applying of filters).
  186. if (trim($buffer) === '') {
  187. $buffer = '';
  188. }
  189. return $buffer;
  190. }
  191. /**
  192. * Get the unfiltered stream data.
  193. *
  194. * @return string
  195. * @throws FilterException
  196. * @throws PdfParserException
  197. */
  198. public function getUnfilteredStream()
  199. {
  200. $stream = $this->getStream();
  201. $filters = PdfDictionary::get($this->value, 'Filter');
  202. if ($filters instanceof PdfNull) {
  203. return $stream;
  204. }
  205. if ($filters instanceof PdfArray) {
  206. $filters = $filters->value;
  207. } else {
  208. $filters = [$filters];
  209. }
  210. $decodeParams = PdfDictionary::get($this->value, 'DecodeParms');
  211. if ($decodeParams instanceof PdfArray) {
  212. $decodeParams = $decodeParams->value;
  213. } else {
  214. $decodeParams = [$decodeParams];
  215. }
  216. foreach ($filters as $key => $filter) {
  217. if (!($filter instanceof PdfName)) {
  218. continue;
  219. }
  220. $decodeParam = null;
  221. if (isset($decodeParams[$key])) {
  222. $decodeParam = ($decodeParams[$key] instanceof PdfDictionary ? $decodeParams[$key] : null);
  223. }
  224. switch ($filter->value) {
  225. case 'FlateDecode':
  226. case 'Fl':
  227. case 'LZWDecode':
  228. case 'LZW':
  229. if (\strpos($filter->value, 'LZW') === 0) {
  230. $filterObject = new Lzw();
  231. } else {
  232. $filterObject = new Flate();
  233. }
  234. $stream = $filterObject->decode($stream);
  235. if ($decodeParam instanceof PdfDictionary) {
  236. $predictor = PdfDictionary::get($decodeParam, 'Predictor', PdfNumeric::create(1));
  237. if ($predictor->value !== 1) {
  238. if (!\class_exists(Predictor::class)) {
  239. throw new PdfParserException(
  240. 'This PDF document makes use of features which are only implemented in the ' .
  241. 'commercial "FPDI PDF-Parser" add-on (see https://www.setasign.com/fpdi-pdf-' .
  242. 'parser).',
  243. PdfParserException::IMPLEMENTED_IN_FPDI_PDF_PARSER
  244. );
  245. }
  246. $colors = PdfDictionary::get($decodeParam, 'Colors', PdfNumeric::create(1));
  247. $bitsPerComponent = PdfDictionary::get(
  248. $decodeParam,
  249. 'BitsPerComponent',
  250. PdfNumeric::create(8)
  251. );
  252. $columns = PdfDictionary::get($decodeParam, 'Columns', PdfNumeric::create(1));
  253. $filterObject = new Predictor(
  254. $predictor->value,
  255. $colors->value,
  256. $bitsPerComponent->value,
  257. $columns->value
  258. );
  259. $stream = $filterObject->decode($stream);
  260. }
  261. }
  262. break;
  263. case 'ASCII85Decode':
  264. case 'A85':
  265. $filterObject = new Ascii85();
  266. $stream = $filterObject->decode($stream);
  267. break;
  268. case 'ASCIIHexDecode':
  269. case 'AHx':
  270. $filterObject = new AsciiHex();
  271. $stream = $filterObject->decode($stream);
  272. break;
  273. default:
  274. throw new FilterException(
  275. \sprintf('Unsupported filter "%s".', $filter->value),
  276. FilterException::UNSUPPORTED_FILTER
  277. );
  278. }
  279. }
  280. return $stream;
  281. }
  282. }