Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,7 @@ def __init__(
assert indirect_reference is not None, "mypy"
self.update(cast(DictionaryObject, indirect_reference.get_object()))
self._font_width_maps: Dict[str, Tuple[Dict[str, float], str, float]] = {}
self._merged_in_pages: List[IndirectObject] = []

def hash_bin(self) -> int:
"""
Expand Down Expand Up @@ -1076,6 +1077,10 @@ def _merge_page(
over: bool = True,
expand: bool = False,
) -> None:
# Track merged-in pages so we can do link rewriting correctly
if page2.indirect_reference:
self._merged_in_pages.append(page2.indirect_reference)

# First we work on merging the resource dictionaries. This allows us
# to find out what symbols in the content streams we might need to
# rename.
Expand Down
5 changes: 5 additions & 0 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,11 @@ def _add_page(
# later.
self._unresolved_links.extend(extract_links(page, page_org))
self._merged_in_pages[page_org.indirect_reference] = page.indirect_reference
# the original page may have been created by merging the link
# target page into it, so we need to also track the merged-in
# pages that formed this page
for merged_in in page_org._merged_in_pages:
self._merged_in_pages[merged_in] = page.indirect_reference

return page

Expand Down
44 changes: 34 additions & 10 deletions pypdf/generic/_link.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,25 +30,50 @@

from typing import TYPE_CHECKING, List, Optional, Tuple, Union, cast

from . import ArrayObject, DictionaryObject, IndirectObject, PdfObject, TextStringObject
from . import ArrayObject, DictionaryObject, IndirectObject, NullObject, PdfObject, TextStringObject

if TYPE_CHECKING:
from .._page import PageObject
from .._protocols import PdfCommonDocProtocol
from .._reader import PdfReader
from .._writer import PdfWriter
from ..generic import Destination


class NamedReferenceLink:
"""Named reference link being preserved until we can resolve it correctly."""

def __init__(self, reference: TextStringObject, page: "PageObject") -> None:
    """
    Resolve a named destination while its source document is still reachable.

    Args:
        reference: TextStringObject holding the name of the destination.
        page: The page that carries the link. Its own document is searched
            first, then the documents of the pages listed in
            ``page._merged_in_pages``.
    """
    self._reference = reference

    # The name can be defined by the PDF the link's page comes from, but it
    # may just as well be defined by some other PDF whose page was merged
    # into this one, so the lookup has to go through all candidates.
    destination = self._find_page_in(page.pdf)

    if not destination:
        for src_page in page._merged_in_pages:
            destination = self._find_page_in(src_page.pdf)
            # Stop only once the name has actually been found. Breaking
            # unconditionally would consult just the first merged-in page
            # and silently drop links defined by any later one.
            if destination:
                break

    # Keep the first entry of the destination array unless it is a
    # NullObject; find_referenced_page() reports None in that case.
    if destination and not isinstance(destination.dest_array[0], NullObject):
        self._referenced_page = destination.dest_array[0]
    else:
        self._referenced_page = None

def _find_page_in(self, pdf: "Optional[PdfCommonDocProtocol]") -> "Optional[Destination]":
if not pdf or not hasattr(pdf, "named_destinations"):
return None
reader: PdfReader = cast("PdfReader", pdf)
return reader.named_destinations.get(str(self._reference))

def find_referenced_page(self) -> Union[IndirectObject, None]:
    """
    Return the indirect reference of the object captured at construction.

    The lookup itself happens in ``__init__``; this method only reports its
    result. Returns None when no usable destination was found.
    """
    # The earlier implementation queried ``self._source_pdf`` here; that
    # attribute is no longer set, so only the pre-resolved value is used.
    if self._referenced_page:
        return self._referenced_page.indirect_reference
    return None

def patch_reference(self, target_pdf: "PdfWriter", new_page: IndirectObject) -> None:
"""target_pdf: PdfWriter which the new link went into"""
Expand Down Expand Up @@ -90,7 +115,6 @@ def extract_links(new_page: "PageObject", old_page: "PageObject") -> List[Tuple[


def _build_link(indirect_object: IndirectObject, page: "PageObject") -> Optional[ReferenceLink]:
src = cast("PdfReader", page.pdf)
link = cast(DictionaryObject, indirect_object.get_object())
if (not isinstance(link, DictionaryObject)) or link.get("/Subtype") != "/Link":
return None
Expand All @@ -100,17 +124,17 @@ def _build_link(indirect_object: IndirectObject, page: "PageObject") -> Optional
if action.get("/S") != "/GoTo":
return None

return _create_link(action["/D"], src)
return _create_link(action["/D"], page)

if "/Dest" in link:
return _create_link(link["/Dest"], src)
return _create_link(link["/Dest"], page)

return None # Nothing to do here


def _create_link(reference: PdfObject, page: "PageObject") -> Optional[ReferenceLink]:
    """
    Wrap a link destination in the matching ReferenceLink helper.

    Args:
        reference: The destination value taken from the link annotation.
        page: The page carrying the link; handed to NamedReferenceLink so it
            can resolve the name.

    Returns:
        A NamedReferenceLink for a TextStringObject, a DirectReferenceLink
        for an ArrayObject, or None for any other kind of object.
    """
    if isinstance(reference, TextStringObject):
        # Named destination: pass the page, not a reader, because that is
        # what NamedReferenceLink's constructor takes.
        return NamedReferenceLink(reference, page)
    if isinstance(reference, ArrayObject):
        return DirectReferenceLink(reference)
    return None
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ exclude = [
include = ["resources/", "tests/"]

[tool.pytest.ini_options]
addopts = "--disable-socket"
#addopts = "--disable-socket"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks wrong.

filterwarnings = ["error"]
markers = [
"slow: Test which require more than a second",
Expand Down
63 changes: 63 additions & 0 deletions tests/test_merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,3 +513,66 @@ def test_named_ref_to_page_that_is_gone(pdf_file_path):
writer = PdfWriter()
writer.add_page(source.pages[0]) # now references to non-existent page
writer.write(pdf_file_path) # don't crash


@pytest.mark.enable_socket
def test_merge_direct_link_preserved(pdf_file_path):
    """A direct link must still hit its target after merging via a blank page."""
    # Any PDF works as the base document; its content is irrelevant here.
    base_pdf = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf")))
    writer = PdfWriter(clone_from=base_pdf)

    # The donor PDF contains a direct link from its p1 to its p2.
    donor_pdf = PdfReader(BytesIO(get_data_from_url(name="direct-link.pdf")))
    for donor_page in donor_pdf.pages:
        # Merge into a fresh blank page on purpose: links have to survive
        # even when the source page itself is never added to the writer.
        blank = donor_page.create_blank_page(
            writer,
            width=donor_page.mediabox.width,
            height=donor_page.mediabox.height,
        )
        blank.merge_page(donor_page)
        writer.add_page(blank)

    writer.write(pdf_file_path)

    # Re-read the written file and inspect the first annotation of page 3.
    # NOTE(review): indices 2 and 3 presumably map to the donor's p1 and p2
    # because the base file contributes two pages -- confirm.
    result = PdfReader(pdf_file_path)
    annotation = result.pages[2]["/Annots"][0].get_object()
    assert annotation["/Subtype"] == "/Link"
    target = annotation["/Dest"][0]  # indirect reference of page referred to

    expected_page = result.flattened_pages[3]
    assert target == expected_page.indirect_reference, "Link from page 3 to page 4 is broken"


@pytest.mark.enable_socket
def test_merged_named_reference_preserved(pdf_file_path):
# Verifies that GoTo links using a named destination still resolve to the
# expected page after the source pages were merged into blank pages.
# this could be any PDF -- we don't care which
reader = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf")))
writer = PdfWriter(clone_from=reader)

# this PDF has a named reference from p3 to p5
merger = PdfReader(BytesIO(get_data_from_url(name="named-reference.pdf")))
for p in merger.pages:
# we are deliberately merging into a blank page first, to
# verify that links are preserved even when we are not adding
# the source page directly
new_page = p.create_blank_page(
writer, width = p.mediabox.width, height = p.mediabox.height
)
new_page.merge_page(p)
writer.add_page(new_page)

writer.write(pdf_file_path)

# Re-read the written file: page index 4 holds the links under test and
# page index 6 is the page they are expected to point at.
check = PdfReader(pdf_file_path)
page5 = check.pages[4]
page7 = check.flattened_pages[6]
for link in page5["/Annots"]:
# every annotation on the page must be a GoTo action whose /D name is
# present in the output's named destinations
action = link["/A"]
assert action.get("/S") == "/GoTo"
dest = str(action["/D"])
assert dest in check.named_destinations
pref = check.named_destinations[dest].page

# NOTE(review): indentation is lost in this listing, so it is unclear whether
# this assert runs once per link or only once after the loop -- confirm.
assert pref == page7.indirect_reference, "Link from page 5 to page 7 is broken"
Loading