Skip to content

Bibliography

Bibliography facade exposed through the TeXSmith public API.

Architecture
BibliographyCollection centralises reference merging, deduplication, and
portable export so higher layers can treat bibliographies as immutable
dictionaries. The collection records provenance internally so API consumers
can report issues with source context.
DoiBibliographyFetcher encapsulates remote lookups so IO code remains outside
the pure transformation layers. Callers provide a DOI and receive parsed
BibTeX data ready to inject into the collection.
bibliography_data_from_string accepts inline BibTeX payloads and converts them into BibliographyData objects, enabling templating systems to embed references alongside content.
Implementation Rationale
The public API needs a stable, documented entry point that is decoupled from
the evolving internal package layout. Re-exporting the curated primitives keeps
backward compatibility guarantees manageable.
Aggregation logic lives in collection.py so both the CLI and the programmatic API can reuse it. By funnelling access through this module we expose documentation and doctest examples close to the import surface users reach for first.

Usage Example

>>> from texsmith.core.bibliography import BibliographyCollection, bibliography_data_from_string
>>> collection = BibliographyCollection()
>>> payload = """@article{doe2023,
...   author = {Doe, Jane},
...   title = {A Minimal Example},
...   year = {2023},
... }"""
>>> inline = bibliography_data_from_string(payload, "doe2023")
>>> collection.load_data(inline, source="inline.bib")
>>> reference = collection.find("doe2023")
>>> reference["fields"]["title"]
'A Minimal Example'

BibliographyCollection

BibliographyCollection()

Aggregate references from one or more BibTeX sources.

Source code in src/texsmith/core/bibliography/collection.py
22
23
24
25
26
27
def __init__(self) -> None:
    """Create an empty collection with no entries, issues, or file statistics."""
    # Per-source bookkeeping: first-seen order and running entry counts.
    self._file_order: list[Path] = []
    self._file_entry_counts: dict[Path, int] = {}
    # Problems recorded while loading.
    self._issues: list[BibliographyIssue] = []
    # Reference storage, both keyed by reference key: the entry itself and
    # the set of paths associated with it.
    self._sources: dict[str, set[Path]] = {}
    self._entries: dict[str, Entry] = {}

file_stats property

file_stats: Sequence[tuple[Path, int]]

Return (file, entry_count) pairs in the order files were processed.

issues property

issues: Sequence[BibliographyIssue]

Return the list of issues discovered while loading references.

clone

clone() -> BibliographyCollection

Return a deep copy of the collection without reparsing sources.

Source code in src/texsmith/core/bibliography/collection.py
203
204
205
206
207
208
209
210
211
def clone(self) -> BibliographyCollection:
    """Build an independent copy of this collection from its in-memory state.

    Entries and their source sets are deep-copied; the issue list, the
    per-file counters and the file order are copied one level deep.
    """
    duplicate = BibliographyCollection()

    # One-level copies: new containers holding the same items.
    duplicate._file_order = self._file_order[:]
    duplicate._file_entry_counts = {**self._file_entry_counts}
    duplicate._issues = [*self._issues]

    # Deep copies, so the duplicate does not share these objects.
    duplicate._sources = copy.deepcopy(self._sources)
    duplicate._entries = copy.deepcopy(self._entries)
    return duplicate

find

find(reference_key: str) -> dict[str, Any] | None

Return the portable representation of a specific reference.

Source code in src/texsmith/core/bibliography/collection.py
149
150
151
152
153
154
155
def find(self, reference_key: str) -> dict[str, Any] | None:
    """Look up one reference and return its portable dictionary form.

    Returns ``None`` when ``reference_key`` is not present in the collection.
    """
    if (match := self._entries.get(reference_key)) is not None:
        origins = self._sources[reference_key]
        return self._portable_entry(reference_key, match, origins)
    return None

list_references

list_references() -> list[dict[str, Any]]

Return all references as portable dictionaries sorted by key.

Source code in src/texsmith/core/bibliography/collection.py
157
158
159
160
161
162
def list_references(self) -> list[dict[str, Any]]:
    """Return every reference in portable form, ordered by reference key."""
    return [
        self._portable_entry(ref_key, self._entries[ref_key], self._sources[ref_key])
        for ref_key in sorted(self._entries)
    ]

load_data

load_data(
    data: BibliographyData,
    *,
    source: Path | str | None = None,
) -> None

Merge pre-parsed bibliography data into the collection.

Source code in src/texsmith/core/bibliography/collection.py
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def load_data(
    self,
    data: BibliographyData,
    *,
    source: Path | str | None = None,
) -> None:
    """Fold already-parsed bibliography data into this collection.

    The resolved source is always registered in the per-file statistics.
    Data without entries records an issue and merges nothing.
    """
    origin = self._resolve_source_path(source)
    incoming = len(data.entries)

    # Update the running count first, then remember first-seen order.
    previous_total = self._file_entry_counts.get(origin, 0)
    self._file_entry_counts[origin] = previous_total + incoming
    if origin not in self._file_order:
        self._file_order.append(origin)

    if incoming:
        self._merge_entries(data, origin)
        return

    empty_notice = BibliographyIssue(
        message="No references found in inline bibliography data.",
        key=None,
        source=origin,
    )
    self._issues.append(empty_notice)

load_files

load_files(files: Iterable[Path | str]) -> None

Load BibTeX entries from one or more files.

Source code in src/texsmith/core/bibliography/collection.py
39
40
41
42
def load_files(self, files: Iterable[Path | str]) -> None:
    """Load each given BibTeX file in turn, converting every item to a ``Path`` first."""
    # ``map`` is lazy, so conversion and loading still alternate per item.
    for bib_path in map(Path, files):
        self._load_file(bib_path)

to_bibliography_data

to_bibliography_data(
    *, keys: Iterable[str] | None = None
) -> BibliographyData

Return a BibliographyData object scoped to the selected keys.

Source code in src/texsmith/core/bibliography/collection.py
171
172
173
174
175
176
177
178
def to_bibliography_data(self, *, keys: Iterable[str] | None = None) -> BibliographyData:
    """Return a BibliographyData object scoped to the selected keys.

    Args:
        keys: Reference keys to include. ``None`` selects every entry.
            Keys that are not in the collection are ignored, and
            duplicates are collapsed.

    Returns:
        A ``BibliographyData`` built from the selected entries. Entries are
        always listed in the collection's own insertion order, whatever
        order (or container type) ``keys`` was supplied in.
    """
    if keys is None:
        entries = dict(self._entries)
    else:
        # Filter the stored mapping instead of iterating a set of keys:
        # set iteration order for strings varies between interpreter runs,
        # which made the resulting entry order non-deterministic.
        wanted = set(keys)
        entries = {key: entry for key, entry in self._entries.items() if key in wanted}
    return BibliographyData(entries=entries)

to_dict

to_dict() -> dict[str, dict[str, Any]]

Return a dictionary keyed by reference identifiers.

Source code in src/texsmith/core/bibliography/collection.py
164
165
166
167
168
169
def to_dict(self) -> dict[str, dict[str, Any]]:
    """Map every reference key to the portable form of its entry."""
    exported: dict[str, dict[str, Any]] = {}
    for ref_key, record in self._entries.items():
        exported[ref_key] = self._portable_entry(ref_key, record, self._sources[ref_key])
    return exported

write_bibtex

write_bibtex(
    target: Path | str, *, keys: Iterable[str] | None = None
) -> None

Persist the bibliography to a BibTeX file.

Source code in src/texsmith/core/bibliography/collection.py
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
def write_bibtex(self, target: Path | str, *, keys: Iterable[str] | None = None) -> None:
    """Persist the bibliography to a BibTeX file.

    Args:
        target: Destination file. Missing parent directories are created.
        keys: Optional subset of reference keys to export; ``None`` exports
            every entry.

    The file is left untouched when it already holds exactly the text that
    would be written. An existing file that cannot be read, or that is not
    valid UTF-8, is treated as stale and overwritten.
    """
    path = Path(target)
    raw_text = self.to_bibliography_data(keys=keys).to_string("bibtex")

    # Undo ``\_`` escapes on url/doi lines only (presumably added by the
    # BibTeX serialiser - confirm); every other line is kept verbatim.
    link_fields = ("url =", "doi =")
    sanitized_lines = [
        line.replace(r"\_", "_") if line.lstrip().lower().startswith(link_fields) else line
        for line in raw_text.splitlines()
    ]
    payload = "\n".join(sanitized_lines).rstrip() + "\n"

    try:
        existing = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # FileNotFoundError is an OSError. UnicodeDecodeError is not, and
        # previously escaped as a crash when the target held non-UTF-8 bytes.
        existing = None
    if existing == payload:
        return
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(payload, encoding="utf-8")

BibliographyIssue dataclass

BibliographyIssue(
    message: str,
    key: str | None = None,
    source: Path | None = None,
)

Represents a problem encountered while loading bibliography entries.

DoiBibliographyFetcher

DoiBibliographyFetcher(
    *,
    session: Session | None = None,
    timeout: float = 10.0,
    user_agent: str | None = None,
    cache: MutableMapping[str, str] | None = None,
    cache_dir: Path | None = None,
    enable_cache: bool = True,
)

Retrieve BibTeX entries for DOIs using content negotiation fallbacks.

Source code in src/texsmith/core/bibliography/doi.py
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
def __init__(
    self,
    *,
    session: RequestsSession | None = None,
    timeout: float = 10.0,
    user_agent: str | None = None,
    cache: MutableMapping[str, str] | None = None,
    cache_dir: Path | None = None,
    enable_cache: bool = True,
) -> None:
    """Configure the fetcher.

    Args:
        session: Optional pre-built HTTP session to use.
        timeout: Timeout stored for later HTTP requests.
        user_agent: User-Agent override; a falsy value selects the class
            default.
        cache: Optional mapping used as the cache store. The mapping is
            used as-is, even when it is empty, so callers can share it.
        cache_dir: Optional directory for the on-disk cache.
        enable_cache: When ``False`` no cache directory is configured.
    """
    self._session_lock = Lock()
    self._session: RequestsSession | None = session
    self._timeout = timeout
    self._user_agent = user_agent or self._DEFAULT_USER_AGENT
    # ``cache or {}`` replaced an empty caller-supplied mapping with a
    # private dict, so a shared cache passed in empty was never filled.
    self._cache: MutableMapping[str, str] = {} if cache is None else cache
    self._enable_cache = enable_cache

    resolved_cache_dir = self._resolve_cache_dir(cache_dir) if enable_cache else None
    # An injected session without an explicit cache_dir gets no on-disk
    # cache (presumably to keep injected/test sessions off the filesystem).
    if session is not None and cache_dir is None:
        resolved_cache_dir = None
    self._cache_dir = resolved_cache_dir

    if self._cache_dir is not None:
        self._cache_dir.mkdir(parents=True, exist_ok=True)

fetch

fetch(value: str) -> str

Return the BibTeX payload for a DOI, trying multiple providers.

Source code in src/texsmith/core/bibliography/doi.py
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
def fetch(self, value: str) -> str:
    """Resolve a DOI to BibTeX text, trying each candidate request in order.

    A cached payload is returned without any request. The first non-empty
    response with a status below 400 is cached and returned. When every
    candidate fails, ``DoiLookupError`` lists the individual failures.
    """
    if requests is None:
        raise DoiLookupError(
            "Python 'requests' dependency is required to resolve DOIs. "
            "Install it via 'pip install requests'."
        )

    doi = self._normalise(value)
    hit = self._read_cache(doi)
    if hit is not None:
        return hit

    failures: list[str] = []
    http = self._ensure_session()
    for url, headers in self._candidate_requests(doi):
        try:
            reply = http.get(url, headers=headers, timeout=self._timeout)
        except requests.RequestException as exc:
            failures.append(f"{url}: {exc}")
            continue

        if reply.status_code >= 400:
            failures.append(f"{url}: HTTP {reply.status_code}")
            continue

        body = reply.text.strip()
        if not body:
            failures.append(f"{url}: empty response")
            continue

        self._write_cache(doi, body)
        return body

    # Every recorded failure is non-empty, so an empty join means none ran.
    detail = "; ".join(failures) or "no responses"
    raise DoiLookupError(f"Unable to resolve DOI '{doi}': {detail}")

DoiLookupError

Bases: Exception

Raised when resolving a DOI to a BibTeX payload fails.

bibliography_data_from_inline_entry

bibliography_data_from_inline_entry(
    key: str, entry: InlineBibliographyEntry
) -> BibliographyData

Create a BibliographyData instance from a manual inline entry.

Source code in src/texsmith/core/bibliography/parsing.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def bibliography_data_from_inline_entry(
    key: str,
    entry: InlineBibliographyEntry,
) -> BibliographyData:
    """Build a single-entry BibliographyData from a manually described inline entry.

    Raises:
        ValueError: If the entry is not flagged as manual or has no entry type.
    """
    convertible = entry.is_manual and entry.entry_type
    if not convertible:
        raise ValueError("Inline entry must define a manual type before conversion.")

    # Roles whose name list is empty are left out.
    people_by_role = {}
    for role, names in entry.persons.items():
        if names:
            people_by_role[role] = [Person(name) for name in names]

    record = Entry(entry.entry_type, fields=dict(entry.fields), persons=people_by_role)
    return BibliographyData(entries={key: record})

bibliography_data_from_string

bibliography_data_from_string(
    payload: str, key: str
) -> BibliographyData

Parse a BibTeX payload and scope it to a specific reference key.

Source code in src/texsmith/core/bibliography/parsing.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
def bibliography_data_from_string(payload: str, key: str) -> BibliographyData:
    """Parse BibTeX text holding exactly one entry and file it under ``key``.

    The key written in the payload is discarded. ``PybtexError`` is raised
    when parsing fails or the payload holds zero or several entries.
    """
    reader = bibtex.Parser()
    try:
        parsed = reader.parse_stream(io.StringIO(payload))
    except (OSError, PybtexError) as exc:
        raise PybtexError(f"Failed to parse inline bibliography payload: {exc}") from exc

    found = [item for _, item in parsed.entries.items()]
    if len(found) == 1:
        return BibliographyData(entries={key: found[0]})
    if found:
        raise PybtexError("Inline bibliography payload must contain a single entry.")
    raise PybtexError("Inline bibliography payload does not contain an entry.")

Aggregation utilities for BibTeX references.

BibliographyCollection

BibliographyCollection()

Aggregate references from one or more BibTeX sources.

Source code in src/texsmith/core/bibliography/collection.py
22
23
24
25
26
27
def __init__(self) -> None:
    """Create an empty collection with no entries, issues, or file statistics."""
    # Per-source bookkeeping: first-seen order and running entry counts.
    self._file_order: list[Path] = []
    self._file_entry_counts: dict[Path, int] = {}
    # Problems recorded while loading.
    self._issues: list[BibliographyIssue] = []
    # Reference storage, both keyed by reference key: the entry itself and
    # the set of paths associated with it.
    self._sources: dict[str, set[Path]] = {}
    self._entries: dict[str, Entry] = {}

file_stats property

file_stats: Sequence[tuple[Path, int]]

Return (file, entry_count) pairs in the order files were processed.

issues property

issues: Sequence[BibliographyIssue]

Return the list of issues discovered while loading references.

clone

clone() -> BibliographyCollection

Return a deep copy of the collection without reparsing sources.

Source code in src/texsmith/core/bibliography/collection.py
203
204
205
206
207
208
209
210
211
def clone(self) -> BibliographyCollection:
    """Build an independent copy of this collection from its in-memory state.

    Entries and their source sets are deep-copied; the issue list, the
    per-file counters and the file order are copied one level deep.
    """
    duplicate = BibliographyCollection()

    # One-level copies: new containers holding the same items.
    duplicate._file_order = self._file_order[:]
    duplicate._file_entry_counts = {**self._file_entry_counts}
    duplicate._issues = [*self._issues]

    # Deep copies, so the duplicate does not share these objects.
    duplicate._sources = copy.deepcopy(self._sources)
    duplicate._entries = copy.deepcopy(self._entries)
    return duplicate

find

find(reference_key: str) -> dict[str, Any] | None

Return the portable representation of a specific reference.

Source code in src/texsmith/core/bibliography/collection.py
149
150
151
152
153
154
155
def find(self, reference_key: str) -> dict[str, Any] | None:
    """Look up one reference and return its portable dictionary form.

    Returns ``None`` when ``reference_key`` is not present in the collection.
    """
    if (match := self._entries.get(reference_key)) is not None:
        origins = self._sources[reference_key]
        return self._portable_entry(reference_key, match, origins)
    return None

list_references

list_references() -> list[dict[str, Any]]

Return all references as portable dictionaries sorted by key.

Source code in src/texsmith/core/bibliography/collection.py
157
158
159
160
161
162
def list_references(self) -> list[dict[str, Any]]:
    """Return every reference in portable form, ordered by reference key."""
    return [
        self._portable_entry(ref_key, self._entries[ref_key], self._sources[ref_key])
        for ref_key in sorted(self._entries)
    ]

load_data

load_data(
    data: BibliographyData,
    *,
    source: Path | str | None = None,
) -> None

Merge pre-parsed bibliography data into the collection.

Source code in src/texsmith/core/bibliography/collection.py
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def load_data(
    self,
    data: BibliographyData,
    *,
    source: Path | str | None = None,
) -> None:
    """Fold already-parsed bibliography data into this collection.

    The resolved source is always registered in the per-file statistics.
    Data without entries records an issue and merges nothing.
    """
    origin = self._resolve_source_path(source)
    incoming = len(data.entries)

    # Update the running count first, then remember first-seen order.
    previous_total = self._file_entry_counts.get(origin, 0)
    self._file_entry_counts[origin] = previous_total + incoming
    if origin not in self._file_order:
        self._file_order.append(origin)

    if incoming:
        self._merge_entries(data, origin)
        return

    empty_notice = BibliographyIssue(
        message="No references found in inline bibliography data.",
        key=None,
        source=origin,
    )
    self._issues.append(empty_notice)

load_files

load_files(files: Iterable[Path | str]) -> None

Load BibTeX entries from one or more files.

Source code in src/texsmith/core/bibliography/collection.py
39
40
41
42
def load_files(self, files: Iterable[Path | str]) -> None:
    """Load each given BibTeX file in turn, converting every item to a ``Path`` first."""
    # ``map`` is lazy, so conversion and loading still alternate per item.
    for bib_path in map(Path, files):
        self._load_file(bib_path)

to_bibliography_data

to_bibliography_data(
    *, keys: Iterable[str] | None = None
) -> BibliographyData

Return a BibliographyData object scoped to the selected keys.

Source code in src/texsmith/core/bibliography/collection.py
171
172
173
174
175
176
177
178
def to_bibliography_data(self, *, keys: Iterable[str] | None = None) -> BibliographyData:
    """Return a BibliographyData object scoped to the selected keys.

    Args:
        keys: Reference keys to include. ``None`` selects every entry.
            Keys that are not in the collection are ignored, and
            duplicates are collapsed.

    Returns:
        A ``BibliographyData`` built from the selected entries. Entries are
        always listed in the collection's own insertion order, whatever
        order (or container type) ``keys`` was supplied in.
    """
    if keys is None:
        entries = dict(self._entries)
    else:
        # Filter the stored mapping instead of iterating a set of keys:
        # set iteration order for strings varies between interpreter runs,
        # which made the resulting entry order non-deterministic.
        wanted = set(keys)
        entries = {key: entry for key, entry in self._entries.items() if key in wanted}
    return BibliographyData(entries=entries)

to_dict

to_dict() -> dict[str, dict[str, Any]]

Return a dictionary keyed by reference identifiers.

Source code in src/texsmith/core/bibliography/collection.py
164
165
166
167
168
169
def to_dict(self) -> dict[str, dict[str, Any]]:
    """Map every reference key to the portable form of its entry."""
    exported: dict[str, dict[str, Any]] = {}
    for ref_key, record in self._entries.items():
        exported[ref_key] = self._portable_entry(ref_key, record, self._sources[ref_key])
    return exported

write_bibtex

write_bibtex(
    target: Path | str, *, keys: Iterable[str] | None = None
) -> None

Persist the bibliography to a BibTeX file.

Source code in src/texsmith/core/bibliography/collection.py
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
def write_bibtex(self, target: Path | str, *, keys: Iterable[str] | None = None) -> None:
    """Persist the bibliography to a BibTeX file.

    Args:
        target: Destination file. Missing parent directories are created.
        keys: Optional subset of reference keys to export; ``None`` exports
            every entry.

    The file is left untouched when it already holds exactly the text that
    would be written. An existing file that cannot be read, or that is not
    valid UTF-8, is treated as stale and overwritten.
    """
    path = Path(target)
    raw_text = self.to_bibliography_data(keys=keys).to_string("bibtex")

    # Undo ``\_`` escapes on url/doi lines only (presumably added by the
    # BibTeX serialiser - confirm); every other line is kept verbatim.
    link_fields = ("url =", "doi =")
    sanitized_lines = [
        line.replace(r"\_", "_") if line.lstrip().lower().startswith(link_fields) else line
        for line in raw_text.splitlines()
    ]
    payload = "\n".join(sanitized_lines).rstrip() + "\n"

    try:
        existing = path.read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError):
        # FileNotFoundError is an OSError. UnicodeDecodeError is not, and
        # previously escaped as a crash when the target held non-UTF-8 bytes.
        existing = None
    if existing == payload:
        return
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(payload, encoding="utf-8")

Helpers for resolving DOIs to BibTeX payloads.

DoiBibliographyFetcher

DoiBibliographyFetcher(
    *,
    session: Session | None = None,
    timeout: float = 10.0,
    user_agent: str | None = None,
    cache: MutableMapping[str, str] | None = None,
    cache_dir: Path | None = None,
    enable_cache: bool = True,
)

Retrieve BibTeX entries for DOIs using content negotiation fallbacks.

Source code in src/texsmith/core/bibliography/doi.py
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
def __init__(
    self,
    *,
    session: RequestsSession | None = None,
    timeout: float = 10.0,
    user_agent: str | None = None,
    cache: MutableMapping[str, str] | None = None,
    cache_dir: Path | None = None,
    enable_cache: bool = True,
) -> None:
    """Configure the fetcher.

    Args:
        session: Optional pre-built HTTP session to use.
        timeout: Timeout stored for later HTTP requests.
        user_agent: User-Agent override; a falsy value selects the class
            default.
        cache: Optional mapping used as the cache store. The mapping is
            used as-is, even when it is empty, so callers can share it.
        cache_dir: Optional directory for the on-disk cache.
        enable_cache: When ``False`` no cache directory is configured.
    """
    self._session_lock = Lock()
    self._session: RequestsSession | None = session
    self._timeout = timeout
    self._user_agent = user_agent or self._DEFAULT_USER_AGENT
    # ``cache or {}`` replaced an empty caller-supplied mapping with a
    # private dict, so a shared cache passed in empty was never filled.
    self._cache: MutableMapping[str, str] = {} if cache is None else cache
    self._enable_cache = enable_cache

    resolved_cache_dir = self._resolve_cache_dir(cache_dir) if enable_cache else None
    # An injected session without an explicit cache_dir gets no on-disk
    # cache (presumably to keep injected/test sessions off the filesystem).
    if session is not None and cache_dir is None:
        resolved_cache_dir = None
    self._cache_dir = resolved_cache_dir

    if self._cache_dir is not None:
        self._cache_dir.mkdir(parents=True, exist_ok=True)

fetch

fetch(value: str) -> str

Return the BibTeX payload for a DOI, trying multiple providers.

Source code in src/texsmith/core/bibliography/doi.py
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
def fetch(self, value: str) -> str:
    """Resolve a DOI to BibTeX text, trying each candidate request in order.

    A cached payload is returned without any request. The first non-empty
    response with a status below 400 is cached and returned. When every
    candidate fails, ``DoiLookupError`` lists the individual failures.
    """
    if requests is None:
        raise DoiLookupError(
            "Python 'requests' dependency is required to resolve DOIs. "
            "Install it via 'pip install requests'."
        )

    doi = self._normalise(value)
    hit = self._read_cache(doi)
    if hit is not None:
        return hit

    failures: list[str] = []
    http = self._ensure_session()
    for url, headers in self._candidate_requests(doi):
        try:
            reply = http.get(url, headers=headers, timeout=self._timeout)
        except requests.RequestException as exc:
            failures.append(f"{url}: {exc}")
            continue

        if reply.status_code >= 400:
            failures.append(f"{url}: HTTP {reply.status_code}")
            continue

        body = reply.text.strip()
        if not body:
            failures.append(f"{url}: empty response")
            continue

        self._write_cache(doi, body)
        return body

    # Every recorded failure is non-empty, so an empty join means none ran.
    detail = "; ".join(failures) or "no responses"
    raise DoiLookupError(f"Unable to resolve DOI '{doi}': {detail}")

DoiLookupError

Bases: Exception

Raised when resolving a DOI to a BibTeX payload fails.

normalise_doi

normalise_doi(value: str) -> str

Return a canonical representation for DOI strings.

Source code in src/texsmith/core/bibliography/doi.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def normalise_doi(value: str) -> str:
    """Reduce a DOI given as a bare identifier, ``doi:`` form or resolver URL to the bare DOI.

    Surrounding whitespace, one leading resolver URL prefix, a leading
    ``doi:`` label and leading/trailing slashes are removed. The case of
    the identifier itself is preserved.

    Raises:
        DoiLookupError: If ``value`` is not a string or nothing is left
            after stripping.
    """
    if not isinstance(value, str):
        raise DoiLookupError("DOI must be provided as a string.")
    text = value.strip()
    if not text:
        raise DoiLookupError("DOI value is empty.")

    resolver_prefixes = (
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
    )
    # Match case-insensitively, but slice the original text so the
    # identifier keeps its case.
    folded = text.lower()
    matched = next((prefix for prefix in resolver_prefixes if folded.startswith(prefix)), None)
    if matched is not None:
        text = text[len(matched) :]

    text = text.strip()
    if text.lower().startswith("doi:"):
        text = text.partition(":")[2]

    text = text.strip().strip("/")
    if text:
        return text
    raise DoiLookupError("DOI value is empty.")

Shared data structures for bibliography processing.

BibliographyIssue dataclass

BibliographyIssue(
    message: str,
    key: str | None = None,
    source: Path | None = None,
)

Represents a problem encountered while loading bibliography entries.

Parsing helpers for bibliography payloads.

bibliography_data_from_inline_entry

bibliography_data_from_inline_entry(
    key: str, entry: InlineBibliographyEntry
) -> BibliographyData

Create a BibliographyData instance from a manual inline entry.

Source code in src/texsmith/core/bibliography/parsing.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
def bibliography_data_from_inline_entry(
    key: str,
    entry: InlineBibliographyEntry,
) -> BibliographyData:
    """Build a single-entry BibliographyData from a manually described inline entry.

    Raises:
        ValueError: If the entry is not flagged as manual or has no entry type.
    """
    convertible = entry.is_manual and entry.entry_type
    if not convertible:
        raise ValueError("Inline entry must define a manual type before conversion.")

    # Roles whose name list is empty are left out.
    people_by_role = {}
    for role, names in entry.persons.items():
        if names:
            people_by_role[role] = [Person(name) for name in names]

    record = Entry(entry.entry_type, fields=dict(entry.fields), persons=people_by_role)
    return BibliographyData(entries={key: record})

bibliography_data_from_string

bibliography_data_from_string(
    payload: str, key: str
) -> BibliographyData

Parse a BibTeX payload and scope it to a specific reference key.

Source code in src/texsmith/core/bibliography/parsing.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
def bibliography_data_from_string(payload: str, key: str) -> BibliographyData:
    """Parse BibTeX text holding exactly one entry and file it under ``key``.

    The key written in the payload is discarded. ``PybtexError`` is raised
    when parsing fails or the payload holds zero or several entries.
    """
    reader = bibtex.Parser()
    try:
        parsed = reader.parse_stream(io.StringIO(payload))
    except (OSError, PybtexError) as exc:
        raise PybtexError(f"Failed to parse inline bibliography payload: {exc}") from exc

    found = [item for _, item in parsed.entries.items()]
    if len(found) == 1:
        return BibliographyData(entries={key: found[0]})
    if found:
        raise PybtexError("Inline bibliography payload must contain a single entry.")
    raise PybtexError("Inline bibliography payload does not contain an entry.")