From 129a4d600c1860beb7ec2ae52a9186088f6f043d Mon Sep 17 00:00:00 2001 From: Laurenz Date: Thu, 4 Jul 2024 15:27:43 +0200 Subject: [PATCH] Fix hyphenation outside of words (#4498) --- crates/typst/src/layout/inline/linebreak.rs | 88 +++++++++++--------- tests/ref/hyphenate-outside-of-words.png | Bin 0 -> 1011 bytes tests/suite/layout/inline/hyphenate.typ | 10 +++ 3 files changed, 57 insertions(+), 41 deletions(-) create mode 100644 tests/ref/hyphenate-outside-of-words.png diff --git a/crates/typst/src/layout/inline/linebreak.rs b/crates/typst/src/layout/inline/linebreak.rs index dbaa9c59..9deaa92a 100644 --- a/crates/typst/src/layout/inline/linebreak.rs +++ b/crates/typst/src/layout/inline/linebreak.rs @@ -1,6 +1,6 @@ use std::ops::{Add, Sub}; -use icu_properties::maps::CodePointMapData; +use icu_properties::maps::{CodePointMapData, CodePointMapDataBorrowed}; use icu_properties::sets::CodePointSetData; use icu_properties::LineBreak; use icu_provider::AsDeserializingBufferProvider; @@ -8,6 +8,7 @@ use icu_provider_adapters::fork::ForkByKeyProvider; use icu_provider_blob::BlobDataProvider; use icu_segmenter::LineSegmenter; use once_cell::sync::Lazy; +use unicode_segmentation::UnicodeSegmentation; use super::*; use crate::engine::Engine; @@ -630,7 +631,7 @@ fn raw_cost( /// This is an internal instead of an external iterator because it makes the /// code much simpler and the consumers of this function don't need the /// composability and flexibility of external iteration anyway. -fn breakpoints<'a>(p: &'a Preparation<'a>, mut f: impl FnMut(usize, Breakpoint)) { +fn breakpoints(p: &Preparation, mut f: impl FnMut(usize, Breakpoint)) { let text = p.text; // Single breakpoint at the end for empty text. @@ -661,7 +662,7 @@ fn breakpoints<'a>(p: &'a Preparation<'a>, mut f: impl FnMut(usize, Breakpoint)) } } - // Get the UAX #14 linebreak opportunities. + // Get the next UAX #14 linebreak opportunity. let Some(point) = iter.next() else { break }; // Skip breakpoint if there is no char before it. icu4x generates one @@ -686,46 +687,13 @@ fn breakpoints<'a>(p: &'a Preparation<'a>, mut f: impl FnMut(usize, Breakpoint)) }; // Hyphenate between the last and current breakpoint. - 'hyphenate: { - if !hyphenate { - break 'hyphenate; - } - - // Extract a hyphenatable "word". - let word = &text[last..point].trim_end_matches(|c: char| !c.is_alphabetic()); - if word.is_empty() { - break 'hyphenate; - } - - let end = last + word.len(); + if hyphenate { let mut offset = last; - - // Determine the language to hyphenate this word in. - let Some(lang) = lang_at(p, last) else { break 'hyphenate }; - - for syllable in hypher::hyphenate(word, lang) { - // Don't hyphenate after the final syllable. - offset += syllable.len(); - if offset == end { - continue; + for segment in text[last..point].split_word_bounds() { + if !segment.is_empty() && segment.chars().all(char::is_alphabetic) { + hyphenations(p, &lb, offset, segment, &mut f); } - - // Filter out hyphenation opportunities where hyphenation was - // actually disabled. - if !hyphenate_at(p, offset) { - continue; - } - - // Filter out forbidden hyphenation opportunities. - if matches!( - syllable.chars().next_back().map(|c| lb.get(c)), - Some(LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ) - ) { - continue; - } - - // Call `f` for the word-internal hyphenation opportunity. - f(offset, Breakpoint::Hyphen); + offset += segment.len(); } } @@ -736,6 +704,44 @@ fn breakpoints<'a>(p: &'a Preparation<'a>, mut f: impl FnMut(usize, Breakpoint)) } } +/// Generate breakpoints for hyphenations within a word. +fn hyphenations( + p: &Preparation, + lb: &CodePointMapDataBorrowed, + mut offset: usize, + word: &str, + mut f: impl FnMut(usize, Breakpoint), +) { + let Some(lang) = lang_at(p, offset) else { return }; + let end = offset + word.len(); + + for syllable in hypher::hyphenate(word, lang) { + offset += syllable.len(); + + // Don't hyphenate after the final syllable. + if offset == end { + continue; + } + + // Filter out hyphenation opportunities where hyphenation was actually + // disabled. + if !hyphenate_at(p, offset) { + continue; + } + + // Filter out forbidden hyphenation opportunities. + if matches!( + syllable.chars().next_back().map(|c| lb.get(c)), + Some(LineBreak::Glue | LineBreak::WordJoiner | LineBreak::ZWJ) + ) { + continue; + } + + // Call `f` for the word-internal hyphenation opportunity. + f(offset, Breakpoint::Hyphen); + } +} + /// Produce linebreak opportunities for a link. fn linebreak_link(link: &str, mut f: impl FnMut(usize)) { #[derive(PartialEq)] diff --git a/tests/ref/hyphenate-outside-of-words.png b/tests/ref/hyphenate-outside-of-words.png new file mode 100644 index 0000000000000000000000000000000000000000..57b11ed84222d1dd8a66e118fb319069efce7c07 GIT binary patch literal 1011 zcmVQhYF`8{b3Gu=pU}Q^F3Y!Y(a4-`yL2xj@wKJSsCj$u^D?&MS3{qe~ z={A_0tgPd>L5gxHgo5BW&2cP+K5vaTF7##>efkjdy#Ik$@=Ly--ytOGkPOSP4FA92 z?Y4P|;LtAI)Az9~GEAZw_e#dvlf;D!3XYo5>ePGGSLb7{h+B)el^M&Tkbc}KCS0Q< z-fT*vSkoWT;WHI6{E*ZF0+yjY6*1wsW)jWml?+4{CeO{fCIS<=)QVzGbht3qrKoN@ zP2;r=zr81$I{eozG;w?qy5cv*gqC(z5gY%R=f&=?j@1GM$Sj zgKkxq!l8+JG^hMk!S<^jhLal!YXNdD+fGrZ9t^5FZy7S}zEzz;@C@tR5Xv61v z!+e%{Ikxp3Xcfh!HW7N8{)4#H3wK`x47W$kpyu;$a zF8L##hQ~{JjuvY1G|VQ?(YCzRGAzR~j61v}5j<^>1{R_PY2fx`iVVf7t$~ZiQlU<3 z^BSD3qy3%TTs4Qgf6u%#7$ht$xx4}=HDHNj>RsLdzuMOOHl=dOEAUSmEV11Gme)X0 z!=~4sdf^NfKfQX(99YeC`1fY7!F)GviIe1xUCl~gyA&|2t~%sTLJIh)RB++66mVA_ zo2R`5*I%8YqkL*M?GJS)31MwSjd}&XmuK&-seS4v1eC43PMrpFio61^1SD>Ss`Yh* z&GF{HaLEr|fqy}zjnU#FWQHTuWkHI;E3ocL>e$Q&SDP@V$5QJr)fNoDm(~`GhBis^91>FTV+9!D(Toh5SCzVN6Z9wQ?QtVJ2{E hM&4=}mf-^r{|!Jnes$8Jf@%N&002ovPDHLkV1kuJ<5vIx literal 0 HcmV?d00001 diff --git a/tests/suite/layout/inline/hyphenate.typ b/tests/suite/layout/inline/hyphenate.typ index c366b38f..debce1da 100644 --- a/tests/suite/layout/inline/hyphenate.typ +++ b/tests/suite/layout/inline/hyphenate.typ @@ -50,6 +50,16 @@ It's a #emph[Tree]beard. #set text(hyphenate: true) #h(6pt) networks, the rest. +--- hyphenate-outside-of-words --- +// More tests for hyphenation of non-words. +#set text(hyphenate: true) +#block(width: 0pt, "doesn't") +#block(width: 0pt, "(OneNote)") +#block(width: 0pt, "(present)") + +#set text(lang: "de") +#block(width: 0pt, "(bzw.)") + --- hyphenate-pt-repeat-hyphen-natural-word-breaking --- // The word breaker naturally breaks arco-da-velha at arco-/-da-velha, // so we shall repeat the hyphen, even that hyphenate is set to false.