Examples
========

This page provides practical examples of using phrasplit for various use cases.

Audiobook Creation
------------------

Split text at natural pause points for text-to-speech processing:

.. code-block:: python

    from phrasplit import split_sentences, split_clauses

    def prepare_for_tts(text):
        """Prepare text for text-to-speech with natural pauses."""
        parts = []
        for sentence in split_sentences(text):
            # Split long sentences at commas for natural pauses
            clauses = split_clauses(sentence)
            parts.extend(clauses)
        return parts

    text = """
    When the sun rose over the mountains, the valley was filled with golden light.
    Birds began to sing their morning songs, and the world slowly awakened.
    """

    parts = prepare_for_tts(text)
    for part in parts:
        print(part)
        # Each part can be sent to TTS with appropriate pauses between them

Audiobook with Paragraph Awareness
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

For more control over pause lengths, use :func:`~phrasplit.split_text` to track
paragraph and sentence boundaries:

.. code-block:: python

    from phrasplit import split_text

    def create_audiobook_segments(text, mode="sentence"):
        """
        Create audiobook segments with pause markers.

        Args:
            text: The text to process
            mode: "sentence" or "clause" for granularity

        Returns:
            List of (text, pause_type) tuples
        """
        segments = split_text(text, mode=mode)
        result = []

        for i, seg in enumerate(segments):
            if not seg.text.strip():
                continue

            # Determine pause type based on structure change
            if i == 0:
                pause_type = "none"
            elif seg.paragraph != segments[i-1].paragraph:
                pause_type = "paragraph"  # Long pause (e.g., 1.0s)
            elif seg.sentence != segments[i-1].sentence:
                pause_type = "sentence"  # Medium pause (e.g., 0.5s)
            else:
                pause_type = "clause"  # Short pause (e.g., 0.2s)

            result.append((seg.text, pause_type))

        return result

    text = """
    The adventure begins here. Our hero sets out on a journey.

    Many challenges lay ahead. But courage would see them through.
    """

    segments = create_audiobook_segments(text, mode="clause")
    for text, pause in segments:
        print(f"[{pause:>10}] {text}")

    # Output:
    # [      none] The adventure begins here.
    # [  sentence] Our hero sets out on a journey.
    # [ paragraph] Many challenges lay ahead.
    # [  sentence] But courage would see them through.

Complete Audiobook Processor
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

A full example integrating with a TTS system:

.. code-block:: python

    from phrasplit import split_text, Segment

    class AudiobookProcessor:
        """Process text for audiobook generation."""

        PAUSE_DURATIONS = {
            "paragraph": 1.0,
            "sentence": 0.5,
            "clause": 0.2,
        }

        def __init__(self, tts_engine):
            self.tts = tts_engine

        def process_chapter(self, text, mode="sentence"):
            """Process a chapter into audio segments."""
            segments = split_text(text, mode=mode)
            segments = [s for s in segments if s.text.strip()]
            audio_segments = []

            for i, seg in enumerate(segments):
                # Generate audio for text
                audio = self.tts.synthesize(seg.text)
                audio_segments.append(audio)

                # Add appropriate pause
                if i < len(segments) - 1:
                    next_seg = segments[i + 1]
                    if next_seg.paragraph != seg.paragraph:
                        pause = self.PAUSE_DURATIONS["paragraph"]
                    elif next_seg.sentence != seg.sentence:
                        pause = self.PAUSE_DURATIONS["sentence"]
                    else:
                        pause = self.PAUSE_DURATIONS["clause"]
                    audio_segments.append(self.tts.silence(pause))

            return self.tts.concatenate(audio_segments)

Subtitle Generation
-------------------

Create subtitles that fit within character limits:

.. code-block:: python

    from phrasplit import split_long_lines

    def create_subtitles(transcript, max_chars=42):
        """Create subtitles from transcript with length limits."""
        lines = split_long_lines(transcript, max_length=max_chars)

        subtitles = []
        for i, line in enumerate(lines, 1):
            subtitle = {
                "index": i,
                "text": line,
                "chars": len(line)
            }
            subtitles.append(subtitle)

        return subtitles

    transcript = """
    This is a very long sentence that would not fit on a single subtitle line
    and needs to be broken up into smaller, more readable chunks for the viewer.
    """

    subtitles = create_subtitles(transcript)
    for sub in subtitles:
        print(f"{sub['index']}: {sub['text']} ({sub['chars']} chars)")

E-book Processing
-----------------

Process an e-book into structured data:

.. code-block:: python

    from phrasplit import split_paragraphs, split_sentences
    import json

    def process_ebook(text):
        """Convert e-book text to structured JSON."""
        chapters = []
        current_chapter = {"paragraphs": []}

        for para in split_paragraphs(text):
            # Detect chapter headers (simple example)
            if para.startswith("Chapter"):
                if current_chapter["paragraphs"]:
                    chapters.append(current_chapter)
                current_chapter = {
                    "title": para,
                    "paragraphs": []
                }
            else:
                sentences = split_sentences(para)
                current_chapter["paragraphs"].append({
                    "text": para,
                    "sentences": sentences,
                    "sentence_count": len(sentences)
                })

        if current_chapter["paragraphs"]:
            chapters.append(current_chapter)

        return chapters

    # Example usage
    book_text = """
    Chapter 1

    It was the best of times. It was the worst of times.

    The city was alive with activity. People rushed through the streets.

    Chapter 2

    A new day dawned. The adventure continued.
    """

    structure = process_ebook(book_text)
    print(json.dumps(structure, indent=2))

Text Analysis
-------------

Analyze text statistics:

.. code-block:: python

    from phrasplit import split_paragraphs, split_sentences, split_clauses

    def analyze_text(text):
        """Generate text statistics."""
        paragraphs = split_paragraphs(text)

        total_sentences = 0
        total_clauses = 0
        sentence_lengths = []

        for para in paragraphs:
            sentences = split_sentences(para)
            total_sentences += len(sentences)

            for sent in sentences:
                sentence_lengths.append(len(sent))
                clauses = split_clauses(sent)
                total_clauses += len(clauses)

        stats = {
            "paragraphs": len(paragraphs),
            "sentences": total_sentences,
            "clauses": total_clauses,
            "avg_sentence_length": sum(sentence_lengths) / len(sentence_lengths),
            "avg_sentences_per_paragraph": total_sentences / len(paragraphs),
            "avg_clauses_per_sentence": total_clauses / total_sentences,
        }

        return stats

    text = """
    The quick brown fox jumps over the lazy dog. This sentence is shorter.

    Another paragraph here, with some clauses, and more content.
    Final sentence of the document.
    """

    stats = analyze_text(text)
    for key, value in stats.items():
        print(f"{key}: {value:.2f}" if isinstance(value, float) else f"{key}: {value}")

Batch Processing
----------------

Process multiple files:

.. code-block:: python

    from pathlib import Path
    from phrasplit import split_sentences

    def process_directory(input_dir, output_dir):
        """Process all text files in a directory."""
        input_path = Path(input_dir)
        output_path = Path(output_dir)
        output_path.mkdir(exist_ok=True)

        for txt_file in input_path.glob("*.txt"):
            print(f"Processing {txt_file.name}...")

            text = txt_file.read_text(encoding="utf-8")
            sentences = split_sentences(text)

            output_file = output_path / txt_file.name
            output_file.write_text("\n".join(sentences), encoding="utf-8")

            print(f" -> {len(sentences)} sentences written to {output_file}")

    # Example usage
    # process_directory("./books", "./processed")

Working with Different Languages
--------------------------------

Use language-specific models:

.. code-block:: python

    from phrasplit import split_sentences

    # German text
    german_text = "Guten Tag. Wie geht es Ihnen? Das Wetter ist schön."
    # First: python -m spacy download de_core_news_sm
    german_sentences = split_sentences(german_text, language_model="de_core_news_sm")

    # French text
    french_text = "Bonjour. Comment allez-vous? Il fait beau aujourd'hui."
    # First: python -m spacy download fr_core_news_sm
    french_sentences = split_sentences(french_text, language_model="fr_core_news_sm")

    # Spanish text
    spanish_text = "Hola. ¿Cómo estás? El tiempo es bueno."
    # First: python -m spacy download es_core_news_sm
    spanish_sentences = split_sentences(spanish_text, language_model="es_core_news_sm")

Integration with pandas
-----------------------

Process text data in DataFrames:

.. code-block:: python

    import pandas as pd
    from phrasplit import split_sentences, split_clauses, split_text

    # Sample data
    data = {
        "id": [1, 2, 3],
        "text": [
            "Hello world. How are you?",
            "The cat sat on the mat, and the dog barked.",
            "Dr. Smith arrived. He was late, unfortunately."
        ]
    }
    df = pd.DataFrame(data)

    # Add sentence count
    df["sentence_count"] = df["text"].apply(lambda x: len(split_sentences(x)))

    # Add clause count
    df["clause_count"] = df["text"].apply(lambda x: len(split_clauses(x)))

    # Explode into one row per sentence
    df_sentences = df.assign(
        sentence=df["text"].apply(split_sentences)
    ).explode("sentence")

    print(df_sentences)

Using split_text with pandas
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

For more detailed analysis with structure information:

.. code-block:: python

    import pandas as pd
    from phrasplit import split_text

    text = """First paragraph sentence one. Sentence two.

    Second paragraph here. Another sentence."""

    # Convert segments to DataFrame
    segments = split_text(text, mode="sentence")
    df = pd.DataFrame([
        {"text": s.text, "paragraph": s.paragraph, "sentence": s.sentence}
        for s in segments
    ])

    print(df)
    #                             text  paragraph  sentence
    # 0  First paragraph sentence one.          0         0
    # 1                  Sentence two.          0         1
    # 2         Second paragraph here.          1         0
    # 3              Another sentence.          1         1

    # Group by paragraph
    for para_id, group in df.groupby("paragraph"):
        print(f"\nParagraph {para_id}:")
        for _, row in group.iterrows():
            print(f" S{row['sentence']}: {row['text']}")