Examples
This page provides practical examples of using phrasplit for various use cases.
Audiobook Creation
Split text at natural pause points for text-to-speech processing:
from phrasplit import split_sentences, split_clauses
def prepare_for_tts(text):
    """Break text into clause-sized chunks for text-to-speech.

    The text is first split into sentences with ``split_sentences``; each
    sentence is then split again with ``split_clauses`` so that longer
    sentences yield several shorter chunks.

    Returns:
        A flat list of the clause strings, in their original order.
    """
    return [
        clause
        for sentence in split_sentences(text)
        for clause in split_clauses(sentence)
    ]
# Sample passage: two sentences, each containing a comma.
text = """
When the sun rose over the mountains, the valley was filled with golden light.
Birds began to sing their morning songs, and the world slowly awakened.
"""

parts = prepare_for_tts(text)

# Print one chunk per line. Each chunk can be sent to a TTS engine, with an
# appropriate pause inserted between consecutive chunks.
for part in parts:
    print(part)
Audiobook with Paragraph Awareness
For more control over pause lengths, use split_text() to track
paragraph and sentence boundaries:
from phrasplit import split_text
def create_audiobook_segments(text, mode="sentence"):
    """
    Create audiobook segments with pause markers.

    Args:
        text: The text to process
        mode: "sentence" or "clause" for granularity

    Returns:
        List of (text, pause_type) tuples. ``pause_type`` names the pause to
        insert *before* the segment: "none" for the first emitted segment,
        otherwise "paragraph", "sentence" or "clause" depending on which
        index changed relative to the previously emitted segment.
    """
    # Drop whitespace-only segments up front. Comparing against the raw
    # list (segments[i - 1]) would measure the boundary against a segment
    # that was never emitted, and a leading blank segment would stop the
    # first emitted segment from being marked "none".
    segments = [seg for seg in split_text(text, mode=mode) if seg.text.strip()]

    result = []
    previous = None
    for seg in segments:
        if previous is None:
            pause_type = "none"
        elif seg.paragraph != previous.paragraph:
            pause_type = "paragraph"  # Long pause (e.g., 1.0s)
        elif seg.sentence != previous.sentence:
            pause_type = "sentence"  # Medium pause (e.g., 0.5s)
        else:
            pause_type = "clause"  # Short pause (e.g., 0.2s)
        result.append((seg.text, pause_type))
        previous = seg
    return result
# Two paragraphs separated by a blank line; the blank line is what produces
# the "paragraph" pause shown in the output below.
text = """
The adventure begins here. Our hero sets out on a journey.

Many challenges lay ahead. But courage would see them through.
"""

segments = create_audiobook_segments(text, mode="clause")

# Use a dedicated loop variable so the sample ``text`` is not overwritten.
for segment_text, pause in segments:
    print(f"[{pause:>10}] {segment_text}")

# Output (pause type is right-aligned in a 10-character field):
# [      none] The adventure begins here.
# [  sentence] Our hero sets out on a journey.
# [ paragraph] Many challenges lay ahead.
# [  sentence] But courage would see them through.
Complete Audiobook Processor
A full example integrating with a TTS system:
from phrasplit import split_text, Segment
class AudiobookProcessor:
    """Turn chapter text into one audio stream using a TTS engine.

    The engine passed to the constructor is only ever used through three
    calls: ``synthesize(text)``, ``silence(seconds)`` and
    ``concatenate(list_of_audio)``.
    """

    # Seconds of silence inserted after a segment, keyed by the kind of
    # boundary that separates it from the next segment.
    PAUSE_DURATIONS = {
        "paragraph": 1.0,
        "sentence": 0.5,
        "clause": 0.2,
    }

    def __init__(self, tts_engine):
        """Store the TTS engine used for synthesis, silence and joining."""
        self.tts = tts_engine

    def process_chapter(self, text, mode="sentence"):
        """Synthesize a chapter, inserting silence between segments.

        Args:
            text: The chapter text.
            mode: Granularity passed through to ``split_text``.

        Returns:
            Whatever the engine's ``concatenate`` returns for the list of
            synthesized segments interleaved with silences.
        """
        # Whitespace-only segments are never spoken, so discard them first.
        spoken = [s for s in split_text(text, mode=mode) if s.text.strip()]
        last_index = len(spoken) - 1

        pieces = []
        for position, current in enumerate(spoken):
            pieces.append(self.tts.synthesize(current.text))

            if position == last_index:
                continue  # nothing follows, so no trailing pause

            # Choose the pause from the boundary between this segment and
            # the next one: paragraph change wins over sentence change.
            following = spoken[position + 1]
            if following.paragraph != current.paragraph:
                boundary = "paragraph"
            elif following.sentence != current.sentence:
                boundary = "sentence"
            else:
                boundary = "clause"
            pieces.append(self.tts.silence(self.PAUSE_DURATIONS[boundary]))

        return self.tts.concatenate(pieces)
Subtitle Generation
Create subtitles that fit within character limits:
from phrasplit import split_long_lines
def create_subtitles(transcript, max_chars=42):
    """Build numbered subtitle records from a transcript.

    The transcript is broken into lines with ``split_long_lines`` using
    ``max_chars`` as the ``max_length`` limit.

    Returns:
        A list of dicts, one per line, with keys ``"index"`` (1-based
        position), ``"text"`` (the line) and ``"chars"`` (its length).
    """
    return [
        {"index": number, "text": line, "chars": len(line)}
        for number, line in enumerate(
            split_long_lines(transcript, max_length=max_chars), start=1
        )
    ]
# One long sentence, wrapped over two source lines, that exceeds the default
# 42-character subtitle limit.
transcript = """
This is a very long sentence that would not fit on a single subtitle line
and needs to be broken up into smaller, more readable chunks for the viewer.
"""

subtitles = create_subtitles(transcript)

# Show each subtitle with its 1-based index and character count.
for sub in subtitles:
    print(f"{sub['index']}: {sub['text']} ({sub['chars']} chars)")
E-book Processing
Process an e-book into structured data:
from phrasplit import split_paragraphs, split_sentences
import json
def process_ebook(text):
    """Convert e-book text to structured JSON.

    Paragraphs come from ``split_paragraphs``. A paragraph that starts with
    "Chapter" opens a new chapter and becomes its ``"title"``; every other
    paragraph is added to the current chapter together with its sentences.

    Returns:
        A list of chapter dicts. Each has a ``"paragraphs"`` list whose
        items carry ``"text"``, ``"sentences"`` and ``"sentence_count"``.
        Chapters opened by a heading also have a ``"title"``; text that
        precedes the first heading is collected in a chapter without one.
    """

    def has_content(chapter):
        # A chapter is worth keeping if it has a heading or any paragraphs.
        # Checking paragraphs alone would silently drop a heading that is
        # followed directly by another heading or by the end of the text.
        return "title" in chapter or bool(chapter["paragraphs"])

    chapters = []
    current_chapter = {"paragraphs": []}

    for para in split_paragraphs(text):
        # Detect chapter headers (simple example)
        if para.startswith("Chapter"):
            if has_content(current_chapter):
                chapters.append(current_chapter)
            current_chapter = {
                "title": para,
                "paragraphs": [],
            }
        else:
            sentences = split_sentences(para)
            current_chapter["paragraphs"].append({
                "text": para,
                "sentences": sentences,
                "sentence_count": len(sentences),
            })

    # Flush the chapter that was still open when the text ended.
    if has_content(current_chapter):
        chapters.append(current_chapter)
    return chapters
# Example usage
# Each chapter heading must be its own paragraph (surrounded by blank lines);
# otherwise the whole text is a single paragraph and no chapter is detected.
book_text = """
Chapter 1

It was the best of times. It was the worst of times.
The city was alive with activity. People rushed through the streets.

Chapter 2

A new day dawned. The adventure continued.
"""

structure = process_ebook(book_text)
print(json.dumps(structure, indent=2))
Text Analysis
Analyze text statistics:
from phrasplit import split_paragraphs, split_sentences, split_clauses
def analyze_text(text):
    """Generate text statistics.

    Counts paragraphs, sentences and clauses using ``split_paragraphs``,
    ``split_sentences`` and ``split_clauses``, and derives three averages.

    Returns:
        A dict with keys ``"paragraphs"``, ``"sentences"``, ``"clauses"``,
        ``"avg_sentence_length"`` (characters per sentence),
        ``"avg_sentences_per_paragraph"`` and ``"avg_clauses_per_sentence"``.
        An average whose denominator is zero (e.g. for empty text) is
        reported as ``0.0`` instead of raising ``ZeroDivisionError``.
    """

    def ratio(numerator, denominator):
        # Guard against empty input: no paragraphs or no sentences.
        return numerator / denominator if denominator else 0.0

    paragraphs = split_paragraphs(text)
    total_clauses = 0
    sentence_lengths = []

    for para in paragraphs:
        for sent in split_sentences(para):
            sentence_lengths.append(len(sent))
            total_clauses += len(split_clauses(sent))

    # One length was recorded per sentence, so this is the sentence count.
    total_sentences = len(sentence_lengths)

    stats = {
        "paragraphs": len(paragraphs),
        "sentences": total_sentences,
        "clauses": total_clauses,
        "avg_sentence_length": ratio(sum(sentence_lengths), total_sentences),
        "avg_sentences_per_paragraph": ratio(total_sentences, len(paragraphs)),
        "avg_clauses_per_sentence": ratio(total_clauses, total_sentences),
    }
    return stats
# Sample document for the statistics example.
text = """
The quick brown fox jumps over the lazy dog. This sentence is shorter.
Another paragraph here, with some clauses, and more content.
Final sentence of the document.
"""

stats = analyze_text(text)

# Averages are floats and are shown with two decimals; counts are printed
# as plain integers.
for key, value in stats.items():
    if isinstance(value, float):
        print(f"{key}: {value:.2f}")
    else:
        print(f"{key}: {value}")
Batch Processing
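Process every ``.txt`` file in a directory, writing one sentence per line to a file of the same name in an output directory: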
from pathlib import Path
from phrasplit import split_sentences
def process_directory(input_dir, output_dir):
    """Process all text files in a directory.

    Every ``*.txt`` file directly inside ``input_dir`` is read as UTF-8,
    split into sentences with ``split_sentences``, and written to a file of
    the same name in ``output_dir`` with one sentence per line.

    Args:
        input_dir: Directory containing the ``.txt`` files to read.
        output_dir: Directory to write results to; it is created, including
            any missing parent directories, if it does not exist.
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    # parents=True so a nested target such as "out/processed" works even
    # when "out" does not exist yet.
    output_path.mkdir(parents=True, exist_ok=True)

    # glob() yields entries in arbitrary order; sort for reproducible logs.
    for txt_file in sorted(input_path.glob("*.txt")):
        print(f"Processing {txt_file.name}...")
        text = txt_file.read_text(encoding="utf-8")
        sentences = split_sentences(text)
        output_file = output_path / txt_file.name
        output_file.write_text("\n".join(sentences), encoding="utf-8")
        print(f" -> {len(sentences)} sentences written to {output_file}")
# Example usage
# process_directory("./books", "./processed")
Working with Different Languages
Pass the name of a language-specific spaCy model via the ``language_model`` argument (download each model first):
from phrasplit import split_sentences
# Each example below needs its spaCy model installed beforehand; the
# download command is given next to the corresponding call.

# German text
# First: python -m spacy download de_core_news_sm
german_text = "Guten Tag. Wie geht es Ihnen? Das Wetter ist schön."
german_sentences = split_sentences(
    german_text,
    language_model="de_core_news_sm",
)

# French text
# First: python -m spacy download fr_core_news_sm
french_text = "Bonjour. Comment allez-vous? Il fait beau aujourd'hui."
french_sentences = split_sentences(
    french_text,
    language_model="fr_core_news_sm",
)

# Spanish text
# First: python -m spacy download es_core_news_sm
spanish_text = "Hola. ¿Cómo estás? El tiempo es bueno."
spanish_sentences = split_sentences(
    spanish_text,
    language_model="es_core_news_sm",
)
Integration with pandas
Process text data in DataFrames:
import pandas as pd
from phrasplit import split_sentences, split_clauses, split_text
# Sample data
data = {
    "id": [1, 2, 3],
    "text": [
        "Hello world. How are you?",
        "The cat sat on the mat, and the dog barked.",
        "Dr. Smith arrived. He was late, unfortunately.",
    ],
}
df = pd.DataFrame(data)

# Split every text into sentences once and reuse the result for both the
# count column and the exploded frame, instead of running the splitter
# twice per row.
sentences = df["text"].apply(split_sentences)

# Add sentence count
df["sentence_count"] = sentences.apply(len)

# Add clause count
df["clause_count"] = df["text"].apply(lambda x: len(split_clauses(x)))

# Explode into one row per sentence
df_sentences = df.assign(sentence=sentences).explode("sentence")
print(df_sentences)
Using split_text with pandas
For more detailed analysis with structure information:
import pandas as pd
from phrasplit import split_text
# Two paragraphs separated by a blank line; without it the text is a single
# paragraph and the paragraph column shown below would not change.
text = """First paragraph sentence one. Sentence two.

Second paragraph here. Another sentence."""

# Convert segments to DataFrame
segments = split_text(text, mode="sentence")
df = pd.DataFrame([
    {"text": s.text, "paragraph": s.paragraph, "sentence": s.sentence}
    for s in segments
])
print(df)
#                             text  paragraph  sentence
# 0  First paragraph sentence one.          0         0
# 1                  Sentence two.          0         1
# 2         Second paragraph here.          1         0
# 3              Another sentence.          1         1

# Group by paragraph
for para_id, group in df.groupby("paragraph"):
    print(f"\nParagraph {para_id}:")
    for _, row in group.iterrows():
        print(f" S{row['sentence']}: {row['text']}")