"""Text splitting utilities using spaCy for NLP-based sentence and clause detection."""
from __future__ import annotations
import re
from typing import TYPE_CHECKING, NamedTuple
from phrasplit.abbreviations import (
get_abbreviations,
get_sentence_ending_abbreviations,
get_sentence_starters,
)
[docs]
class Segment(NamedTuple):
    """A text segment together with its position in the source document.

    Attributes:
        text: The text content of the segment.
        paragraph: Paragraph index (0-based) within the document.
        sentence: Sentence index (0-based) within the paragraph.
            ``None`` for paragraph mode, which is also the default.
    """

    text: str
    paragraph: int
    sentence: int | None = None
if TYPE_CHECKING:
from spacy.language import Language # type: ignore[import-not-found]
try:
import spacy # type: ignore[import-not-found]
SPACY_AVAILABLE = True
except ImportError:
SPACY_AVAILABLE = False
spacy = None
# Cache of loaded spaCy pipelines, keyed by model name, so that each model
# is loaded at most once per process (see _get_nlp).
_nlp_cache: dict[str, Language] = {}
# Placeholders for ellipsis during spaCy processing.
# We use Unicode private use area characters to avoid collision with real text.
_ELLIPSIS_3_PLACEHOLDER = "\ue000"  # 3 dots: ...
_ELLIPSIS_4_PLACEHOLDER = "\ue001"  # 4 dots: ....
_ELLIPSIS_SPACED_PLACEHOLDER = "\ue002"  # Spaced: . . .
_ELLIPSIS_UNICODE_PLACEHOLDER = "\ue003"  # Unicode ellipsis: U+2026
# Prefix for 5+ dots. It is followed by exactly one character that encodes
# the dot count as chr(0xE010 + count) (see _protect_ellipsis).
_ELLIPSIS_LONG_PREFIX = "\ue004"
# Regex for hyphenated line breaks (e.g., "recom-\nmendation" -> "recommendation")
_HYPHENATED_LINEBREAK = re.compile(r"(\w+)-\s*\n\s*(\w+)")
# URL pattern for splitting: an http(s) scheme followed by a run of
# non-whitespace characters.
_URL_PATTERN = re.compile(r"(https?://\S+)")
# Pattern to detect a possible abbreviation at the end of a sentence.
# Group 1 captures the ASCII-letter word directly before the final period.
# The regex itself does not know any abbreviations; callers compare the
# captured word against the abbreviation lists.
_ABBREV_END_PATTERN = re.compile(r"(\b[A-Za-z]+)\.\s*$")
# Default maximum chunk size for spaCy processing (will be capped by nlp.max_length)
_DEFAULT_MAX_CHUNK_SIZE = 500000
# Safety margin (in characters) at chunk boundaries to avoid cutting sentences
_DEFAULT_SAFETY_MARGIN = 100
def _fix_hyphenated_linebreaks(text: str) -> str:
    """
    Re-join words that were hyphenated across a line break.

    Typical for PDF and OCR output, where a word is cut with a trailing
    hyphen at the end of a line. The hyphen, the newline and any
    surrounding whitespace are removed.

    Example: "recom-\\nmendation" -> "recommendation"

    Args:
        text: Input text

    Returns:
        Text in which every "word-<newline>word" occurrence is glued together
    """

    def glue(found: re.Match[str]) -> str:
        # Keep only the two word halves; drop hyphen and line break.
        return found.group(1) + found.group(2)

    return _HYPHENATED_LINEBREAK.sub(glue, text)
def _normalize_whitespace(text: str) -> str:
    """
    Collapse whitespace while keeping paragraph structure intact.

    Two passes are applied:
      1. Any blank-line gap (newline, optional whitespace, newline) becomes
         exactly two newlines, i.e. a canonical paragraph break.
      2. Every run of non-newline whitespace becomes a single space.
         Newlines themselves are never touched by this pass.

    Args:
        text: Input text

    Returns:
        Text with normalized whitespace
    """
    with_canonical_breaks = re.sub(r"\n\s*\n", "\n\n", text)
    # [^\S\n] == "whitespace that is not a newline"
    return re.sub(r"[^\S\n]+", " ", with_canonical_breaks)
def _preprocess_text(text: str) -> str:
    """
    Clean up raw text before it is handed to the NLP pipeline.

    The cleanup runs in this order:
      1. Hyphenated line breaks are joined (common in PDFs).
      2. Whitespace is normalized.

    Args:
        text: Input text

    Returns:
        Preprocessed text
    """
    return _normalize_whitespace(_fix_hyphenated_linebreaks(text))
def _protect_ellipsis(text: str) -> str:
    """
    Swap every ellipsis variant for a placeholder before sentence detection.

    Variants handled, in the order they are substituted:
      - Spaced ellipsis ". . ." (must run before the plain-dot rules)
      - Unicode ellipsis U+2026
      - Five or more consecutive dots (prefix + one character encoding
        the dot count as ``chr(0xE010 + count)``)
      - Exactly four dots "...."
      - Exactly three dots "..."

    Longer dot runs are substituted before shorter ones so a short rule
    never consumes part of a longer run. Each variant has its own
    placeholder, so the original spelling can be restored afterwards.

    Args:
        text: Input text

    Returns:
        Text with all ellipsis variants replaced by placeholders
    """
    for literal, placeholder in (
        (". . .", _ELLIPSIS_SPACED_PLACEHOLDER),
        ("\u2026", _ELLIPSIS_UNICODE_PLACEHOLDER),
    ):
        text = text.replace(literal, placeholder)

    # The count is offset into the private use area so the encoded
    # character is never a control character or whitespace.
    text = re.sub(
        r"\.{5,}",
        lambda run: _ELLIPSIS_LONG_PREFIX + chr(0xE010 + len(run.group(0))),
        text,
    )

    for literal, placeholder in (
        ("....", _ELLIPSIS_4_PLACEHOLDER),
        ("...", _ELLIPSIS_3_PLACEHOLDER),
    ):
        text = text.replace(literal, placeholder)
    return text
def _restore_ellipsis(text: str) -> str:
    """
    Turn ellipsis placeholders back into their original spelling.

    Substitutions run in the reverse order of protection: three dots,
    four dots, long dot runs, Unicode ellipsis, spaced ellipsis.

    Args:
        text: Text containing ellipsis placeholders

    Returns:
        Text with every placeholder replaced by the ellipsis it stood for
    """
    text = text.replace(_ELLIPSIS_3_PLACEHOLDER, "...")
    text = text.replace(_ELLIPSIS_4_PLACEHOLDER, "....")

    # The character right after the long-run prefix carries the dot count,
    # offset by 0xE010. DOTALL lets "." match any character there,
    # including a newline.
    long_run = _ELLIPSIS_LONG_PREFIX + r"(.)"
    text = re.sub(
        long_run,
        lambda encoded: "." * (ord(encoded.group(1)) - 0xE010),
        text,
        flags=re.DOTALL,
    )

    text = text.replace(_ELLIPSIS_UNICODE_PLACEHOLDER, "\u2026")
    return text.replace(_ELLIPSIS_SPACED_PLACEHOLDER, ". . .")
def _split_urls(sentences: list[str]) -> list[str]:
    """
    Break up sentences that hold two or more URLs.

    URLs are often listed one per line in the source, but sentence
    detection may fuse them. A sentence with fewer than two URLs is passed
    through untouched. Otherwise it is cut at the start of every URL after
    the first: the first piece keeps any leading text plus the first URL,
    and each later piece is a URL plus whatever follows it up to the next
    URL. Pieces are stripped.

    Args:
        sentences: List of sentences from spaCy

    Returns:
        List of sentences with multiple URLs properly separated
    """
    pieces: list[str] = []
    for sentence in sentences:
        # Cheap substring test first; only run the regex when a scheme shows up.
        has_scheme = "http://" in sentence or "https://" in sentence
        urls = list(_URL_PATTERN.finditer(sentence)) if has_scheme else []
        if len(urls) < 2:
            pieces.append(sentence)
            continue

        # Cut points: sentence start, start of each URL but the first, sentence end.
        cuts = [0, *(url.start() for url in urls[1:]), len(sentence)]
        for begin, end in zip(cuts, cuts[1:]):
            chunk = sentence[begin:end].strip()
            if chunk:
                pieces.append(chunk)
    return pieces
def _merge_abbreviation_splits(
    sentences: list[str],
    language_model: str = "en_core_web_sm",
) -> list[str]:
    """
    Merge sentences that were incorrectly split after abbreviations.

    spaCy sometimes splits after an abbreviation when it is followed by a
    name or continuation. This function merges such cases.

    Conservative approach: only merge if:
      1. Previous sentence ends with a known abbreviation + period
      2. The abbreviation is NOT one that commonly ends sentences
      3. Next sentence starts with a capitalized word that is not all caps
         (all caps might be an acronym/heading)
      4. Next sentence does NOT start with a common sentence starter

    Merging is cumulative: once two sentences are joined, the joined text
    is checked against the following sentence as well, so a chain of false
    splits ("Mr." / "Smith met Dr." / "Jones.") collapses into one sentence.

    Args:
        sentences: List of sentences from spaCy
        language_model: spaCy language model name (for language-specific
            abbreviations)

    Returns:
        List of sentences with abbreviation splits merged. The input list
        itself is returned when there are no abbreviations for the model or
        fewer than two sentences.
    """
    abbreviations = get_abbreviations(language_model)
    # Nothing to compare against, or nothing to merge with.
    if not abbreviations:
        return sentences
    if len(sentences) <= 1:
        return sentences

    sentence_starters = get_sentence_starters()
    sentence_ending_abbrevs = get_sentence_ending_abbreviations()

    def is_false_split(previous: str, following: str) -> bool:
        """Return True when `following` looks like a continuation of `previous`."""
        match = _ABBREV_END_PATTERN.search(previous)
        if match is None:
            return False
        abbrev = match.group(1)
        if abbrev not in abbreviations or abbrev in sentence_ending_abbrevs:
            return False
        words = following.split()
        if not words:
            return False
        first_word = words[0]
        return (
            first_word[0].isupper()
            and first_word not in sentence_starters
            and not first_word.isupper()
        )

    result: list[str] = []
    current = sentences[0]
    for next_sent in sentences[1:]:
        if is_false_split(current, next_sent):
            # Keep accumulating so the merged text is re-checked next round.
            current = current + " " + next_sent
        else:
            result.append(current)
            current = next_sent
    result.append(current)
    return result
# Pattern to detect an ellipsis that is followed by a new sentence.
# Group 1: the ellipsis itself (3+ consecutive dots OR the spaced form ". . .").
# Then at least one whitespace character.
# Group 2: optional straight or curly quote characters followed by one
# ASCII capital letter (A-Z), i.e. the start of the next sentence.
_ELLIPSIS_SENTENCE_BREAK = re.compile(
    r'(\.{3,}|\. \. \.)\s+(["\'\u201c\u201d\u2018\u2019]*[A-Z])',
)
def _split_after_ellipsis(sentences: list[str]) -> list[str]:
    """
    Cut sentences at an ellipsis that is followed by a new sentence.

    For text such as "He was tired.... The next day" spaCy may miss the
    boundary after the ellipsis. Wherever an ellipsis (3+ dots or ". . .")
    is followed by whitespace and a capital letter (optionally preceded by
    quotes), the sentence is cut right after the ellipsis, so the ellipsis
    stays with the first part. One sentence may be cut several times.
    Sentences without such a boundary are passed through unchanged.

    Args:
        sentences: List of sentences from spaCy

    Returns:
        List of sentences with ellipsis boundaries properly handled
    """
    if not sentences:
        return sentences

    pieces: list[str] = []
    for sentence in sentences:
        boundary = _ELLIPSIS_SENTENCE_BREAK.search(sentence)
        if boundary is None:
            pieces.append(sentence)
            continue

        tail = sentence
        while boundary is not None:
            cut = boundary.end(1)  # position right after the ellipsis
            head = tail[:cut].strip()
            tail = tail[cut:].strip()
            if head:
                pieces.append(head)
            boundary = _ELLIPSIS_SENTENCE_BREAK.search(tail)

        # Whatever is left after the last cut is the final part.
        leftover = tail.strip()
        if leftover:
            pieces.append(leftover)
    return pieces
def _apply_corrections(
    sentences: list[str],
    language_model: str = "en_core_web_sm",
    split_on_colon: bool = True,
    nlp: Language | None = None,
) -> list[str]:
    """
    Run the post-processing pipeline that repairs common segmentation errors.

    Stages, in order:
      1. Merge sentences wrongly split after abbreviations (fewer sentences).
         This runs first so that pieces are combined before any splitting.
      2. Split after an ellipsis followed by a capital letter (more sentences).
      3. Split sentences containing multiple URLs (more sentences).

    Args:
        sentences: List of sentences from spaCy
        language_model: spaCy language model name, used by the abbreviation stage
        split_on_colon: Accepted for API compatibility; not used here
        nlp: Accepted for API compatibility; not used here

    Returns:
        Corrected list of sentences
    """
    merged = _merge_abbreviation_splits(sentences, language_model)
    return _split_urls(_split_after_ellipsis(merged))
def _get_nlp(language_model: str = "en_core_web_sm") -> Language:
    """Return the spaCy model for `language_model`, loading it on first use.

    Loaded models are kept in the module-level cache, so later calls with
    the same name return the cached object.

    Args:
        language_model: Name of the spaCy language model to load

    Returns:
        Loaded spaCy Language model

    Raises:
        ImportError: If spaCy is not installed
        OSError: If the specified language model is not found
    """
    if not SPACY_AVAILABLE:
        raise ImportError(
            "spaCy is required for this feature. Install with: pip install phrasplit"
        )

    if language_model in _nlp_cache:
        return _nlp_cache[language_model]

    try:
        # SPACY_AVAILABLE is only True when the import succeeded.
        assert spacy is not None
        _nlp_cache[language_model] = spacy.load(language_model)
    except OSError:
        # Replace spaCy's error with one that includes the download command.
        raise OSError(
            f"spaCy language model '{language_model}' not found. "
            f"Download with: python -m spacy download {language_model}"
        ) from None
    return _nlp_cache[language_model]
def _extract_sentences(doc) -> list[str]:
    """Collect the sentence texts of a spaCy Doc.

    Args:
        doc: A spaCy Doc object (anything exposing ``sents`` whose items
            have a ``text`` attribute)

    Returns:
        List of sentence strings, stripped, with empty ones dropped
    """
    stripped = (span.text.strip() for span in doc.sents)
    return [sentence for sentence in stripped if sentence]
def _process_long_text(
    text: str,
    nlp: Language,
    max_chunk: int = _DEFAULT_MAX_CHUNK_SIZE,
    safety_margin: int = _DEFAULT_SAFETY_MARGIN,
) -> list[str]:
    """Process text that may exceed spaCy's max_length incrementally.

    Text that fits into one chunk is processed in a single call. Longer
    text is processed chunk by chunk. From every chunk except the last,
    only sentences that end before the final `safety_margin` characters
    are kept; the next chunk starts right after the last kept sentence, so
    a sentence that straddles a chunk border is re-read in full.

    Args:
        text: Input text (should be preprocessed, ellipsis protected)
        nlp: spaCy Language model
        max_chunk: Maximum characters to process at once
        safety_margin: Buffer at chunk end to avoid cutting sentences

    Returns:
        List of sentence strings (stripped, non-empty)

    Raises:
        ValueError: If the text needs chunking but the usable chunk size,
            ``min(max_chunk, nlp.max_length - safety_margin)``, is below 1.
            Without this check the chunk loop could not advance.
    """
    # Cap max_chunk to spaCy's limit minus safety margin
    effective_max = min(max_chunk, nlp.max_length - safety_margin)
    if len(text) <= effective_max:
        return _extract_sentences(nlp(text))

    if not text:
        return []
    if effective_max < 1:
        raise ValueError(
            "chunk size must be at least 1 character, got "
            f"{effective_max} (max_chunk={max_chunk}, "
            f"nlp.max_length={nlp.max_length}, safety_margin={safety_margin})"
        )

    sentences: list[str] = []
    text_len = len(text)
    start_idx = 0
    while start_idx < text_len:
        end_idx = min(start_idx + effective_max, text_len)
        chunk = text[start_idx:end_idx]
        doc = nlp(chunk)

        if end_idx >= text_len:
            # Last chunk - take all sentences
            sentences.extend(_extract_sentences(doc))
            break

        # Not the last chunk - keep only sentences that end clear of the
        # margin; anything closer to the chunk end may be cut off.
        cutoff = len(chunk) - safety_margin
        last_complete_end = 0
        for sent in doc.sents:
            sent_text = sent.text.strip()
            if sent_text and sent.end_char < cutoff:
                sentences.append(sent_text)
                last_complete_end = sent.end_char

        if last_complete_end > 0:
            # Resume right after the last sentence that was kept.
            start_idx += last_complete_end
        else:
            # No sentence boundary found - take all and move on
            sentences.extend(_extract_sentences(doc))
            start_idx = end_idx

        # Skip leading whitespace for next iteration
        while start_idx < text_len and text[start_idx] in " \t\n\r":
            start_idx += 1
    return sentences
[docs]
def split_paragraphs(text: str) -> list[str]:
    """
    Break text into paragraphs at blank lines.

    The text is preprocessed first (hyphenated line breaks joined,
    whitespace normalized), then split wherever two newlines are separated
    only by optional whitespace.

    Args:
        text: Input text

    Returns:
        List of paragraphs (non-empty, stripped)
    """
    cleaned = _preprocess_text(text)
    candidates = (block.strip() for block in re.split(r"\n\s*\n", cleaned))
    return [paragraph for paragraph in candidates if paragraph]
[docs]
def split_sentences(
    text: str,
    language_model: str = "en_core_web_sm",
    apply_corrections: bool = True,
    split_on_colon: bool = True,
) -> list[str]:
    """
    Split text into sentences using spaCy.

    The text is split into paragraphs first and each paragraph is processed
    on its own. Post-processing corrections are also applied per paragraph,
    matching split_text, so the abbreviation merge can never join the last
    sentence of one paragraph to the first sentence of the next.

    Args:
        text: Input text
        language_model: spaCy language model to use
        apply_corrections: Whether to apply post-processing corrections for
            common spaCy errors (abbreviation merging, ellipsis and URL
            splitting). Default is True.
        split_on_colon: Kept for API compatibility (currently unused).
            spaCy's default colon behavior is used. Default is True.

    Returns:
        List of sentences, in document order

    Raises:
        ImportError: If spaCy is not installed
        OSError: If the specified language model is not found
    """
    nlp = _get_nlp(language_model)
    paragraphs = split_paragraphs(text)
    if not paragraphs:
        return []

    result: list[str] = []
    for para in paragraphs:
        # Protect ellipsis from being treated as sentence boundaries
        protected = _protect_ellipsis(para)
        # Process paragraph into sentences (handles long text), then put
        # the original ellipsis spelling back.
        sentences = [
            _restore_ellipsis(sent) for sent in _process_long_text(protected, nlp)
        ]
        if apply_corrections:
            # Scoped to this paragraph so no merge crosses a paragraph break.
            sentences = _apply_corrections(
                sentences, language_model, split_on_colon, nlp
            )
        result.extend(sentences)
    return result
def _split_sentence_into_clauses(sentence: str) -> list[str]:
    """
    Cut a sentence into comma-delimited parts for audiobook creation.

    The cut happens at whitespace that directly follows a comma, so every
    comma stays at the end of the part before it. This creates natural
    pause points for text-to-speech processing.

    Args:
        sentence: A single sentence

    Returns:
        List of stripped, non-empty parts; ``[sentence]`` unchanged when
        no non-empty part remains
    """
    # The lookbehind keeps the comma out of the consumed separator.
    stripped = (chunk.strip() for chunk in re.split(r"(?<=,)\s+", sentence))
    clauses = [chunk for chunk in stripped if chunk]
    return clauses or [sentence]
[docs]
def split_clauses(
    text: str,
    language_model: str = "en_core_web_sm",
) -> list[str]:
    """
    Split text into comma-delimited parts for audiobook creation.

    Sentences are detected with spaCy, paragraph by paragraph, and each
    sentence is then cut at its commas. The comma stays at the end of each
    part, creating natural pause points for text-to-speech processing.
    No post-processing corrections are applied here.

    Args:
        text: Input text
        language_model: spaCy language model to use

    Returns:
        List of comma-separated parts

    Example:
        Input: "I do like coffee, and I like wine."
        Output: ["I do like coffee,", "and I like wine."]
    """
    nlp = _get_nlp(language_model)
    paragraphs = split_paragraphs(text)
    if not paragraphs:
        return []

    parts: list[str] = []
    for paragraph in paragraphs:
        # Ellipsis is hidden from spaCy and restored per sentence afterwards.
        protected = _protect_ellipsis(paragraph)
        for raw_sentence in _process_long_text(protected, nlp):
            sentence = _restore_ellipsis(raw_sentence)
            parts.extend(_split_sentence_into_clauses(sentence))
    return parts
def _split_at_clauses(text: str, max_length: int) -> list[str]:
    """
    Pack comma-delimited parts of `text` into lines of at most `max_length`.

    The text is cut after each comma (the comma stays with the text before
    it). Parts are then joined greedily with single spaces for as long as
    the line stays within `max_length`. Any line that is still too long is
    handed to the word-level hard split.

    Args:
        text: Text to split
        max_length: Maximum line length

    Returns:
        List of lines; ``[text]`` when no line was produced
    """
    pieces = [chunk.strip() for chunk in re.split(r"(?<=,)\s+", text)]

    packed: list[str] = []
    buffer = ""
    for piece in pieces:
        if not piece:
            continue
        if not buffer:
            buffer = piece
        elif len(buffer) + 1 + len(piece) <= max_length:
            buffer = f"{buffer} {piece}"
        else:
            packed.append(buffer)
            buffer = piece
    if buffer:
        packed.append(buffer)

    # A single part can still exceed the limit; break those at word boundaries.
    fitted: list[str] = []
    for line in packed:
        if len(line) > max_length:
            fitted.extend(_hard_split(line, max_length))
        else:
            fitted.append(line)
    return fitted or [text]
def _hard_split(text: str, max_length: int) -> list[str]:
    """
    Wrap text greedily at word boundaries.

    Used as a last resort when clause splitting isn't enough. Words are
    joined with single spaces until adding the next one would exceed
    `max_length`. A single word longer than `max_length` is kept whole on
    its own line.

    Args:
        text: Text to split
        max_length: Maximum line length

    Returns:
        List of lines; ``[text]`` when the text contains no words
    """
    lines: list[str] = []
    buffer = ""
    for token in text.split():
        candidate = f"{buffer} {token}" if buffer else token
        if buffer and len(candidate) > max_length:
            # Next word does not fit: close the line and start a new one.
            lines.append(buffer)
            buffer = token
        else:
            buffer = candidate
    if buffer:
        lines.append(buffer)
    return lines or [text]
def _split_at_boundaries(text: str, max_length: int, nlp: Language) -> list[str]:
    """
    Break text into lines of at most `max_length` at sentence boundaries.

    Sentences are detected with spaCy (ellipsis protected during detection
    and restored afterwards) and packed greedily into lines, joined by
    single spaces. A sentence that alone exceeds `max_length` closes the
    line in progress and is split further at clause boundaries.

    Args:
        text: Text to split
        max_length: Maximum line length
        nlp: spaCy language model

    Returns:
        List of lines; ``[text]`` when no line was produced
    """
    raw_sentences = _process_long_text(_protect_ellipsis(text), nlp)

    lines: list[str] = []
    buffer = ""
    for raw in raw_sentences:
        sentence = _restore_ellipsis(raw)

        if len(sentence) > max_length:
            # Oversized sentence: close the line in progress, then let the
            # clause splitter break the sentence up.
            if buffer:
                lines.append(buffer)
                buffer = ""
            lines.extend(_split_at_clauses(sentence, max_length))
            continue

        if not buffer:
            buffer = sentence
        elif len(buffer) + 1 + len(sentence) <= max_length:
            buffer = f"{buffer} {sentence}"
        else:
            lines.append(buffer)
            buffer = sentence

    if buffer:
        lines.append(buffer)
    return lines or [text]
[docs]
def split_long_lines(
    text: str,
    max_length: int,
    language_model: str = "en_core_web_sm",
) -> list[str]:
    """
    Split lines exceeding max_length at sentence/clause boundaries.

    The text is processed line by line (split on newlines). Lines already
    within the limit are passed through unchanged. Longer lines are broken
    up in stages:
      1. At sentence boundaries
      2. If a sentence is still too long, after commas
      3. If still too long, at word boundaries

    Args:
        text: Input text
        max_length: Maximum line length in characters (must be positive)
        language_model: spaCy language model to use

    Returns:
        List of lines, each within max_length (except single words
        exceeding limit)

    Raises:
        ValueError: If max_length is less than 1
    """
    if max_length < 1:
        raise ValueError(f"max_length must be at least 1, got {max_length}")

    nlp = _get_nlp(language_model)
    output: list[str] = []
    for line in text.split("\n"):
        if len(line) > max_length:
            output.extend(_split_at_boundaries(line, max_length, nlp))
        else:
            output.append(line)
    return output
[docs]
def split_text(
    text: str,
    mode: str = "sentence",
    language_model: str = "en_core_web_sm",
    apply_corrections: bool = True,
    split_on_colon: bool = True,
) -> list[Segment]:
    """
    Split text into segments that carry their paragraph/sentence position.

    One entry point for all granularity levels. Every segment records which
    paragraph (and, where applicable, which sentence) it came from, which is
    useful for audiobook generation where different pause lengths are
    needed between paragraphs vs. sentences vs. clauses.

    Args:
        text: Input text to split
        mode: Splitting mode - one of:
            - "paragraph": Split into paragraphs only
            - "sentence": Split into sentences, grouped by paragraph
            - "clause": Split into clauses (comma-separated), with paragraph
              and sentence info
        language_model: spaCy language model to use (for sentence/clause modes)
        apply_corrections: Whether to apply post-processing corrections for
            common spaCy errors (URL splitting, abbreviation handling).
            Default is True. Only applies to sentence/clause modes.
        split_on_colon: Kept for API compatibility (currently unused).
            spaCy's default colon behavior is used. Default is True.

    Returns:
        List of Segment namedtuples, each containing:
            - text: The segment text
            - paragraph: Paragraph index (0-based)
            - sentence: Sentence index within paragraph (0-based).
              None for paragraph mode.

    Raises:
        ValueError: If mode is not one of "paragraph", "sentence", "clause"

    Example:
        >>> segments = split_text("Hello world. How are you?\\n\\nNew paragraph.")
        >>> for seg in segments:
        ...     print(f"P{seg.paragraph} S{seg.sentence}: {seg.text}")
        P0 S0: Hello world.
        P0 S1: How are you?
        P1 S0: New paragraph.
        >>> # Detect paragraph changes for longer pauses
        >>> for i, seg in enumerate(segments):
        ...     if i > 0 and seg.paragraph != segments[i-1].paragraph:
        ...         print("--- paragraph break ---")
        ...     print(seg.text)
    """
    valid_modes = ("paragraph", "sentence", "clause")
    if mode not in valid_modes:
        raise ValueError(f"mode must be one of {valid_modes}, got {mode!r}")

    paragraphs = split_paragraphs(text)
    if not paragraphs:
        return []

    if mode == "paragraph":
        # Paragraph mode never touches spaCy.
        return [
            Segment(text=paragraph, paragraph=index, sentence=None)
            for index, paragraph in enumerate(paragraphs)
        ]

    # For sentence and clause modes, we need spaCy
    nlp = _get_nlp(language_model)
    segments: list[Segment] = []
    for para_idx, paragraph in enumerate(paragraphs):
        # Ellipsis is hidden from spaCy and restored per sentence afterwards.
        raw_sentences = _process_long_text(_protect_ellipsis(paragraph), nlp)
        sentences = [_restore_ellipsis(raw) for raw in raw_sentences]
        if apply_corrections:
            sentences = _apply_corrections(
                sentences, language_model, split_on_colon, nlp
            )

        for sent_idx, sentence in enumerate(sentences):
            # mode was validated above, so it is "sentence" or "clause" here.
            if mode == "sentence":
                texts = [sentence]
            else:
                texts = _split_sentence_into_clauses(sentence)
            segments.extend(
                Segment(text=piece, paragraph=para_idx, sentence=sent_idx)
                for piece in texts
            )
    return segments