382 lines
11 KiB
PHP
382 lines
11 KiB
PHP
<?php
|
|
|
|
/**
|
|
* This file is part of FPDI
|
|
*
|
|
* @package Fpdi
|
|
* @copyright Copyright (c) 2020 Setasign GmbH & Co. KG (https://www.setasign.com)
|
|
* @license http://opensource.org/licenses/mit-license The MIT License
|
|
*/
|
|
|
|
namespace Fpdi\PdfParser;
|
|
|
|
use Fpdi\PdfParser\CrossReference\CrossReference;
|
|
use Fpdi\PdfParser\CrossReference\CrossReferenceException;
|
|
use Fpdi\PdfParser\Type\PdfArray;
|
|
use Fpdi\PdfParser\Type\PdfBoolean;
|
|
use Fpdi\PdfParser\Type\PdfDictionary;
|
|
use Fpdi\PdfParser\Type\PdfHexString;
|
|
use Fpdi\PdfParser\Type\PdfIndirectObject;
|
|
use Fpdi\PdfParser\Type\PdfIndirectObjectReference;
|
|
use Fpdi\PdfParser\Type\PdfName;
|
|
use Fpdi\PdfParser\Type\PdfNull;
|
|
use Fpdi\PdfParser\Type\PdfNumeric;
|
|
use Fpdi\PdfParser\Type\PdfStream;
|
|
use Fpdi\PdfParser\Type\PdfString;
|
|
use Fpdi\PdfParser\Type\PdfToken;
|
|
use Fpdi\PdfParser\Type\PdfType;
|
|
|
|
/**
|
|
* A PDF parser class
|
|
*/
|
|
class PdfParser
|
|
{
|
|
/**
|
|
* @var StreamReader
|
|
*/
|
|
protected $streamReader;
|
|
|
|
/**
|
|
* @var Tokenizer
|
|
*/
|
|
protected $tokenizer;
|
|
|
|
/**
|
|
* The file header.
|
|
*
|
|
* @var string
|
|
*/
|
|
protected $fileHeader;
|
|
|
|
/**
|
|
* The offset to the file header.
|
|
*
|
|
* @var int
|
|
*/
|
|
protected $fileHeaderOffset;
|
|
|
|
/**
|
|
* @var CrossReference|null
|
|
*/
|
|
protected $xref;
|
|
|
|
/**
|
|
* All read objects.
|
|
*
|
|
* @var array
|
|
*/
|
|
protected $objects = [];
|
|
|
|
/**
|
|
* PdfParser constructor.
|
|
*
|
|
* @param StreamReader $streamReader
|
|
*/
|
|
public function __construct(StreamReader $streamReader)
|
|
{
|
|
$this->streamReader = $streamReader;
|
|
$this->tokenizer = new Tokenizer($streamReader);
|
|
}
|
|
|
|
/**
|
|
* Removes cycled references.
|
|
*
|
|
* @internal
|
|
*/
|
|
public function cleanUp()
|
|
{
|
|
$this->xref = null;
|
|
}
|
|
|
|
/**
|
|
* Get the stream reader instance.
|
|
*
|
|
* @return StreamReader
|
|
*/
|
|
public function getStreamReader()
|
|
{
|
|
return $this->streamReader;
|
|
}
|
|
|
|
/**
|
|
* Get the tokenizer instance.
|
|
*
|
|
* @return Tokenizer
|
|
*/
|
|
public function getTokenizer()
|
|
{
|
|
return $this->tokenizer;
|
|
}
|
|
|
|
/**
|
|
* Resolves the file header.
|
|
*
|
|
* @throws PdfParserException
|
|
* @return int
|
|
*/
|
|
protected function resolveFileHeader()
|
|
{
|
|
if ($this->fileHeader) {
|
|
return $this->fileHeaderOffset;
|
|
}
|
|
|
|
$this->streamReader->reset(0);
|
|
$maxIterations = 1000;
|
|
while (true) {
|
|
$buffer = $this->streamReader->getBuffer(false);
|
|
$offset = \strpos($buffer, '%PDF-');
|
|
if ($offset === false) {
|
|
if (!$this->streamReader->increaseLength(100) || (--$maxIterations === 0)) {
|
|
throw new PdfParserException(
|
|
'Unable to find PDF file header.',
|
|
PdfParserException::FILE_HEADER_NOT_FOUND
|
|
);
|
|
}
|
|
continue;
|
|
}
|
|
break;
|
|
}
|
|
|
|
$this->fileHeaderOffset = $offset;
|
|
$this->streamReader->setOffset($offset);
|
|
|
|
$this->fileHeader = \trim($this->streamReader->readLine());
|
|
return $this->fileHeaderOffset;
|
|
}
|
|
|
|
/**
|
|
* Get the cross reference instance.
|
|
*
|
|
* @return CrossReference
|
|
* @throws CrossReferenceException
|
|
* @throws PdfParserException
|
|
*/
|
|
public function getCrossReference()
|
|
{
|
|
if ($this->xref === null) {
|
|
$this->xref = new CrossReference($this, $this->resolveFileHeader());
|
|
}
|
|
|
|
return $this->xref;
|
|
}
|
|
|
|
/**
|
|
* Get the PDF version.
|
|
*
|
|
* @return int[] An array of major and minor version.
|
|
* @throws PdfParserException
|
|
*/
|
|
public function getPdfVersion()
|
|
{
|
|
$this->resolveFileHeader();
|
|
|
|
if (\preg_match('/%PDF-(\d)\.(\d)/', $this->fileHeader, $result) === 0) {
|
|
throw new PdfParserException(
|
|
'Unable to extract PDF version from file header.',
|
|
PdfParserException::PDF_VERSION_NOT_FOUND
|
|
);
|
|
}
|
|
list(, $major, $minor) = $result;
|
|
|
|
$catalog = $this->getCatalog();
|
|
if (isset($catalog->value['Version'])) {
|
|
$versionParts = \explode(
|
|
'.',
|
|
PdfName::unescape(PdfType::resolve($catalog->value['Version'], $this)->value)
|
|
);
|
|
if (count($versionParts) === 2) {
|
|
list($major, $minor) = $versionParts;
|
|
}
|
|
}
|
|
|
|
return [(int) $major, (int) $minor];
|
|
}
|
|
|
|
/**
|
|
* Get the catalog dictionary.
|
|
*
|
|
* @return PdfDictionary
|
|
* @throws Type\PdfTypeException
|
|
* @throws CrossReferenceException
|
|
* @throws PdfParserException
|
|
*/
|
|
public function getCatalog()
|
|
{
|
|
$trailer = $this->getCrossReference()->getTrailer();
|
|
|
|
$catalog = PdfType::resolve(PdfDictionary::get($trailer, 'Root'), $this);
|
|
|
|
return PdfDictionary::ensure($catalog);
|
|
}
|
|
|
|
/**
|
|
* Get an indirect object by its object number.
|
|
*
|
|
* @param int $objectNumber
|
|
* @param bool $cache
|
|
* @return PdfIndirectObject
|
|
* @throws CrossReferenceException
|
|
* @throws PdfParserException
|
|
*/
|
|
public function getIndirectObject($objectNumber, $cache = false)
|
|
{
|
|
$objectNumber = (int) $objectNumber;
|
|
if (isset($this->objects[$objectNumber])) {
|
|
return $this->objects[$objectNumber];
|
|
}
|
|
|
|
$object = $this->getCrossReference()->getIndirectObject($objectNumber);
|
|
|
|
if ($cache) {
|
|
$this->objects[$objectNumber] = $object;
|
|
}
|
|
|
|
return $object;
|
|
}
|
|
|
|
/**
|
|
* Read a PDF value.
|
|
*
|
|
* @param null|bool|string $token
|
|
* @param null|string $expectedType
|
|
* @return false|PdfArray|PdfBoolean|PdfDictionary|PdfHexString|PdfIndirectObject|PdfIndirectObjectReference|PdfName|PdfNull|PdfNumeric|PdfStream|PdfString|PdfToken
|
|
* @throws Type\PdfTypeException
|
|
*/
|
|
public function readValue($token = null, $expectedType = null)
|
|
{
|
|
if ($token === null) {
|
|
$token = $this->tokenizer->getNextToken();
|
|
}
|
|
|
|
if ($token === false) {
|
|
if ($expectedType !== null) {
|
|
throw new Type\PdfTypeException('Got unexpected token type.', Type\PdfTypeException::INVALID_DATA_TYPE);
|
|
}
|
|
return false;
|
|
}
|
|
|
|
switch ($token) {
|
|
case '(':
|
|
$this->ensureExpectedType($token, $expectedType);
|
|
return PdfString::parse($this->streamReader);
|
|
|
|
case '<':
|
|
if ($this->streamReader->getByte() === '<') {
|
|
$this->ensureExpectedType('<<', $expectedType);
|
|
$this->streamReader->addOffset(1);
|
|
return PdfDictionary::parse($this->tokenizer, $this->streamReader, $this);
|
|
}
|
|
|
|
$this->ensureExpectedType($token, $expectedType);
|
|
return PdfHexString::parse($this->streamReader);
|
|
|
|
case '/':
|
|
$this->ensureExpectedType($token, $expectedType);
|
|
return PdfName::parse($this->tokenizer, $this->streamReader);
|
|
|
|
case '[':
|
|
$this->ensureExpectedType($token, $expectedType);
|
|
return PdfArray::parse($this->tokenizer, $this);
|
|
|
|
default:
|
|
if (\is_numeric($token)) {
|
|
if (($token2 = $this->tokenizer->getNextToken()) !== false) {
|
|
if (\is_numeric($token2) && ($token3 = $this->tokenizer->getNextToken()) !== false) {
|
|
switch ($token3) {
|
|
case 'obj':
|
|
if ($expectedType !== null && $expectedType !== PdfIndirectObject::class) {
|
|
throw new Type\PdfTypeException(
|
|
'Got unexpected token type.',
|
|
Type\PdfTypeException::INVALID_DATA_TYPE
|
|
);
|
|
}
|
|
|
|
return PdfIndirectObject::parse(
|
|
(int) $token,
|
|
(int) $token2,
|
|
$this,
|
|
$this->tokenizer,
|
|
$this->streamReader
|
|
);
|
|
case 'R':
|
|
if (
|
|
$expectedType !== null &&
|
|
$expectedType !== PdfIndirectObjectReference::class
|
|
) {
|
|
throw new Type\PdfTypeException(
|
|
'Got unexpected token type.',
|
|
Type\PdfTypeException::INVALID_DATA_TYPE
|
|
);
|
|
}
|
|
|
|
return PdfIndirectObjectReference::create((int) $token, (int) $token2);
|
|
}
|
|
|
|
$this->tokenizer->pushStack($token3);
|
|
}
|
|
|
|
$this->tokenizer->pushStack($token2);
|
|
}
|
|
|
|
if ($expectedType !== null && $expectedType !== PdfNumeric::class) {
|
|
throw new Type\PdfTypeException(
|
|
'Got unexpected token type.',
|
|
Type\PdfTypeException::INVALID_DATA_TYPE
|
|
);
|
|
}
|
|
return PdfNumeric::create($token + 0);
|
|
}
|
|
|
|
if ($token === 'true' || $token === 'false') {
|
|
$this->ensureExpectedType($token, $expectedType);
|
|
return PdfBoolean::create($token === 'true');
|
|
}
|
|
|
|
if ($token === 'null') {
|
|
$this->ensureExpectedType($token, $expectedType);
|
|
return new PdfNull();
|
|
}
|
|
|
|
if ($expectedType !== null && $expectedType !== PdfToken::class) {
|
|
throw new Type\PdfTypeException(
|
|
'Got unexpected token type.',
|
|
Type\PdfTypeException::INVALID_DATA_TYPE
|
|
);
|
|
}
|
|
|
|
$v = new PdfToken();
|
|
$v->value = $token;
|
|
|
|
return $v;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Ensures that the token will evaluate to an expected object type (or not).
|
|
*
|
|
* @param string $token
|
|
* @param string|null $expectedType
|
|
* @return bool
|
|
* @throws Type\PdfTypeException
|
|
*/
|
|
private function ensureExpectedType($token, $expectedType)
|
|
{
|
|
static $mapping = [
|
|
'(' => PdfString::class,
|
|
'<' => PdfHexString::class,
|
|
'<<' => PdfDictionary::class,
|
|
'/' => PdfName::class,
|
|
'[' => PdfArray::class,
|
|
'true' => PdfBoolean::class,
|
|
'false' => PdfBoolean::class,
|
|
'null' => PdfNull::class
|
|
];
|
|
|
|
if ($expectedType === null || $mapping[$token] === $expectedType) {
|
|
return true;
|
|
}
|
|
|
|
throw new Type\PdfTypeException('Got unexpected token type.', Type\PdfTypeException::INVALID_DATA_TYPE);
|
|
}
|
|
}
|