Skip to content

Commit 956e3fd

Browse files
committed
ENH: Preserve links in added pages also when links come from merged-in pages
1 parent c17f03a commit 956e3fd

File tree

5 files changed

+108
-11
lines changed

5 files changed

+108
-11
lines changed

pypdf/_page.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -511,6 +511,7 @@ def __init__(
511511
assert indirect_reference is not None, "mypy"
512512
self.update(cast(DictionaryObject, indirect_reference.get_object()))
513513
self._font_width_maps: Dict[str, Tuple[Dict[str, float], str, float]] = {}
514+
self._merged_in_pages: List[IndirectObject] = []
514515

515516
def hash_bin(self) -> int:
516517
"""
@@ -1076,6 +1077,10 @@ def _merge_page(
10761077
over: bool = True,
10771078
expand: bool = False,
10781079
) -> None:
1080+
# Track merged-in pages so we can do link rewriting correctly
1081+
if page2.indirect_reference:
1082+
self._merged_in_pages.append(page2.indirect_reference)
1083+
10791084
# First we work on merging the resource dictionaries. This allows us
10801085
# to find out what symbols in the content streams we might need to
10811086
# rename.

pypdf/_writer.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -519,6 +519,11 @@ def _add_page(
519519
# later.
520520
self._unresolved_links.extend(extract_links(page, page_org))
521521
self._merged_in_pages[page_org.indirect_reference] = page.indirect_reference
522+
# the original page may have been created by merging the link
523+
# target page into it, so we need to also track the merged-in
524+
# pages that formed this page
525+
for merged_in in page_org._merged_in_pages:
526+
self._merged_in_pages[merged_in] = page.indirect_reference
522527

523528
return page
524529

pypdf/generic/_link.py

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -30,25 +30,50 @@
3030

3131
from typing import TYPE_CHECKING, List, Optional, Tuple, Union, cast
3232

33-
from . import ArrayObject, DictionaryObject, IndirectObject, PdfObject, TextStringObject
33+
from . import ArrayObject, DictionaryObject, IndirectObject, NullObject, PdfObject, TextStringObject
3434

3535
if TYPE_CHECKING:
3636
from .._page import PageObject
37+
from .._protocols import PdfCommonDocProtocol
3738
from .._reader import PdfReader
3839
from .._writer import PdfWriter
40+
from ..generic import Destination
3941

4042

4143
class NamedReferenceLink:
4244
"""Named reference link being preserved until we can resolve it correctly."""
4345

44-
def __init__(self, reference: TextStringObject, source_pdf: "PdfReader") -> None:
46+
def __init__(self, reference: TextStringObject, page: "PageObject") -> None:
    """
    Resolve a named reference to the page it targets.

    Args:
        reference: TextStringObject with named reference.
        page: The page that carries the link. The document it belongs to
            is searched first; if the name is not found there, the
            documents of the pages merged into it are searched in order.
    """
    self._reference = reference

    # To work out where the reference points we need to find the source
    # PDF that defines the name. This *can* be the PDF the page containing
    # the link comes from, but it may also be some other PDF merged into
    # this page, so we search each candidate in turn.
    destination = self._find_page_in(page.pdf)

    if not destination:
        for src_page in page._merged_in_pages:
            destination = self._find_page_in(src_page.pdf)
            # Stop only once a document actually defines the name;
            # otherwise keep looking through the remaining merged-in pages.
            if destination:
                break

    # A destination whose first array entry is a NullObject has no usable
    # target page, so it is treated the same as "not found".
    if destination and not isinstance(destination.dest_array[0], NullObject):
        self._referenced_page = destination.dest_array[0]
    else:
        self._referenced_page = None
66+
67+
def _find_page_in(self, pdf: "Optional[PdfCommonDocProtocol]") -> "Optional[Destination]":
68+
if not pdf or not hasattr(pdf, "named_destinations"):
69+
return None
70+
reader: PdfReader = cast("PdfReader", pdf)
71+
return reader.named_destinations.get(str(self._reference))
4872

4973
def find_referenced_page(self) -> Union[IndirectObject, None]:
    """
    Return the indirect reference of the page this link resolved to.

    Returns None when no page was stored for the link (``_referenced_page``
    is falsy).
    """
    target = self._referenced_page
    return target.indirect_reference if target else None
5277

5378
def patch_reference(self, target_pdf: "PdfWriter", new_page: IndirectObject) -> None:
5479
"""target_pdf: PdfWriter which the new link went into"""
@@ -90,7 +115,6 @@ def extract_links(new_page: "PageObject", old_page: "PageObject") -> List[Tuple[
90115

91116

92117
def _build_link(indirect_object: IndirectObject, page: "PageObject") -> Optional[ReferenceLink]:
93-
src = cast("PdfReader", page.pdf)
94118
link = cast(DictionaryObject, indirect_object.get_object())
95119
if (not isinstance(link, DictionaryObject)) or link.get("/Subtype") != "/Link":
96120
return None
@@ -100,17 +124,17 @@ def _build_link(indirect_object: IndirectObject, page: "PageObject") -> Optional
100124
if action.get("/S") != "/GoTo":
101125
return None
102126

103-
return _create_link(action["/D"], src)
127+
return _create_link(action["/D"], page)
104128

105129
if "/Dest" in link:
106-
return _create_link(link["/Dest"], src)
130+
return _create_link(link["/Dest"], page)
107131

108132
return None # Nothing to do here
109133

110134

111-
def _create_link(reference: PdfObject, source_pdf: "PdfReader")-> Optional[ReferenceLink]:
135+
def _create_link(reference: PdfObject, page: "PageObject") -> Optional[ReferenceLink]:
    """
    Wrap a link destination in the matching reference-link class.

    A TextStringObject (named reference) becomes a NamedReferenceLink bound
    to ``page``; an ArrayObject becomes a DirectReferenceLink; any other
    object yields None.
    """
    link: Optional[ReferenceLink] = None
    if isinstance(reference, TextStringObject):
        link = NamedReferenceLink(reference, page)
    elif isinstance(reference, ArrayObject):
        link = DirectReferenceLink(reference)
    return link

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ exclude = [
8585
include = ["resources/", "tests/"]
8686

8787
[tool.pytest.ini_options]
88-
addopts = "--disable-socket"
88+
#addopts = "--disable-socket"
8989
filterwarnings = ["error"]
9090
markers = [
9191
"slow: Test which require more than a second",

tests/test_merger.py

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -513,3 +513,66 @@ def test_named_ref_to_page_that_is_gone(pdf_file_path):
513513
writer = PdfWriter()
514514
writer.add_page(source.pages[0]) # now references to non-existent page
515515
writer.write(pdf_file_path) # don't crash
516+
517+
518+
@pytest.mark.enable_socket
def test_merge_direct_link_preserved(pdf_file_path):
    """Direct link targets must survive merging pages into blank pages first."""
    # The base document could be any PDF -- its content is irrelevant here.
    base = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf")))
    writer = PdfWriter(clone_from=base)

    # This PDF has a direct link from p1 to p2.
    linked = PdfReader(BytesIO(get_data_from_url(name="direct-link.pdf")))
    for src in linked.pages:
        # Deliberately merge into a blank page first, to verify that links
        # are preserved even when the source page is not added directly.
        blank = src.create_blank_page(
            writer, width=src.mediabox.width, height=src.mediabox.height
        )
        blank.merge_page(src)
        writer.add_page(blank)

    writer.write(pdf_file_path)

    result = PdfReader(pdf_file_path)
    source_page = result.pages[2]
    annotation = source_page["/Annots"][0].get_object()
    assert annotation["/Subtype"] == "/Link"
    target_ref = annotation["/Dest"][0]  # indirect reference of page referred to

    expected_page = result.flattened_pages[3]
    assert target_ref == expected_page.indirect_reference, "Link from page 3 to page 4 is broken"
546+
547+
548+
@pytest.mark.enable_socket
def test_merged_named_reference_preserved(pdf_file_path):
    """Named-reference links must survive merging pages into blank pages first."""
    # The base document could be any PDF -- its content is irrelevant here.
    base = PdfReader(BytesIO(get_data_from_url(name="iss3268.pdf")))
    writer = PdfWriter(clone_from=base)

    # This PDF has a named reference from p3 to p5.
    linked = PdfReader(BytesIO(get_data_from_url(name="named-reference.pdf")))
    for src in linked.pages:
        # Deliberately merge into a blank page first, to verify that links
        # are preserved even when the source page is not added directly.
        blank = src.create_blank_page(
            writer, width=src.mediabox.width, height=src.mediabox.height
        )
        blank.merge_page(src)
        writer.add_page(blank)

    writer.write(pdf_file_path)

    result = PdfReader(pdf_file_path)
    source_page = result.pages[4]
    expected_page = result.flattened_pages[6]
    for annotation in source_page["/Annots"]:
        action = annotation["/A"]
        assert action.get("/S") == "/GoTo"
        name = str(action["/D"])
        assert name in result.named_destinations
        resolved = result.named_destinations[name].page

        # Checked per link: `resolved` only exists inside the loop.
        assert resolved == expected_page.indirect_reference, "Link from page 5 to page 7 is broken"

0 commit comments

Comments
 (0)