diff --git a/crates/typst/src/layout/inline/mod.rs b/crates/typst/src/layout/inline/mod.rs index 0b73eef6..f8b17f46 100644 --- a/crates/typst/src/layout/inline/mod.rs +++ b/crates/typst/src/layout/inline/mod.rs @@ -298,6 +298,19 @@ impl SpanMapper { } } +/// The kind of dash that ends a line. +#[derive(Debug, Copy, Clone, Eq, PartialEq)] +pub(super) enum Dash { + /// A hyphen inserted by hyphenation, or a soft hyphen (U+00AD), that breaks a word. + SoftHyphen, + /// A regular hyphen (`-`) present in the text, e.g. in the compound word beija-flor. + HardHyphen, + /// An em dash (`—`). + Long, + /// An en dash (`–`). + Short, +} + /// A layouted line, consisting of a sequence of layouted paragraph items that /// are mostly borrowed from the preparation phase. This type enables you to /// measure the size of a line in a range before committing to building the @@ -327,7 +340,7 @@ struct Line<'a> { justify: bool, /// Whether the line ends with a hyphen or dash, either naturally or through /// hyphenation. - dash: bool, + dash: Option<Dash>, } impl<'a> Line<'a> { @@ -814,8 +827,10 @@ fn linebreak_simple<'a>( let mut last = None; breakpoints(p, |end, breakpoint| { + let prepend_hyphen = lines.last().map(should_repeat_hyphen).unwrap_or(false); + // Compute the line and its size. - let mut attempt = line(engine, p, start..end, breakpoint); + let mut attempt = line(engine, p, start..end, breakpoint, prepend_hyphen); // If the line doesn't fit anymore, we push the last fitting attempt // into the stack and rebuild the line from the attempt's end. 
The @@ -824,7 +839,7 @@ fn linebreak_simple<'a>( if let Some((last_attempt, last_end)) = last.take() { lines.push(last_attempt); start = last_end; - attempt = line(engine, p, start..end, breakpoint); + attempt = line(engine, p, start..end, breakpoint, lines.last().map(should_repeat_hyphen).unwrap_or(false)); } } @@ -894,7 +909,7 @@ fn linebreak_optimized<'a>( let mut table = vec![Entry { pred: 0, total: 0.0, - line: line(engine, p, 0..0, Breakpoint::Mandatory), + line: line(engine, p, 0..0, Breakpoint::Mandatory, false), }]; let em = p.size; @@ -908,8 +923,9 @@ fn linebreak_optimized<'a>( for (i, pred) in table.iter().enumerate().skip(active) { // Layout the line. let start = pred.line.end; + let prepend_hyphen = should_repeat_hyphen(&pred.line); - let attempt = line(engine, p, start..end, breakpoint); + let attempt = line(engine, p, start..end, breakpoint, prepend_hyphen); // Determine how much the line's spaces would need to be stretched // to make it the desired width. @@ -987,7 +1003,7 @@ fn linebreak_optimized<'a>( cost = (0.01 + cost).powi(2); // Penalize two consecutive dashes (not necessarily hyphens) extra. - if attempt.dash && pred.line.dash { + if attempt.dash.is_some() && pred.line.dash.is_some() { cost += CONSECUTIVE_DASH_COST; } @@ -1022,6 +1038,7 @@ fn line<'a>( p: &'a Preparation, mut range: Range, breakpoint: Breakpoint, + prepend_hyphen: bool, ) -> Line<'a> { let end = range.end; let mut justify = @@ -1037,7 +1054,7 @@ fn line<'a>( last: None, width: Abs::zero(), justify, - dash: false, + dash: None, }; } @@ -1047,7 +1064,7 @@ fn line<'a>( // Reshape the last item if it's split in half or hyphenated. let mut last = None; - let mut dash = false; + let mut dash = None; if let Some((Item::Text(shaped), before)) = inner.split_last() { // Compute the range we want to shape, trimming whitespace at the // end of the line. @@ -1062,7 +1079,17 @@ fn line<'a>( // Deal with hyphens, dashes and justification. 
let shy = trimmed.ends_with('\u{ad}'); let hyphen = breakpoint == Breakpoint::Hyphen; - dash = hyphen || shy || trimmed.ends_with(['-', '–', '—']); + dash = if hyphen || shy { + Some(Dash::SoftHyphen) + } else if trimmed.ends_with('-') { + Some(Dash::HardHyphen) + } else if trimmed.ends_with('–') { + Some(Dash::Short) + } else if trimmed.ends_with('—') { + Some(Dash::Long) + } else { + None + }; justify |= text.ends_with('\u{2028}'); // Deal with CJK punctuation at line ends. @@ -1079,7 +1106,11 @@ fn line<'a>( // need the shaped empty string to make the line the appropriate // height. That is the case exactly if the string is empty and there // are no other items in the line. - if hyphen || start + shaped.text.len() > range.end || maybe_adjust_last_glyph { + if hyphen + || start + shaped.text.len() > range.end + || maybe_adjust_last_glyph + || prepend_hyphen + { if hyphen || start < range.end || before.is_empty() { let mut reshaped = shaped.reshape(engine, &p.spans, start..range.end); if hyphen || shy { @@ -1131,7 +1162,10 @@ fn line<'a>( let end = range.end.min(base + shaped.text.len()); // Reshape if necessary. - if range.start + shaped.text.len() > end || maybe_adjust_first_glyph { + if range.start + shaped.text.len() > end + || maybe_adjust_first_glyph + || prepend_hyphen + { // If the range is empty, we don't want to push an empty text item. 
if range.start < end { let reshaped = shaped.reshape(engine, &p.spans, range.start..end); @@ -1143,6 +1177,15 @@ fn line<'a>( } } + if prepend_hyphen { + let reshaped = first.as_mut().or(last.as_mut()).and_then(Item::text_mut); + if let Some(reshaped) = reshaped { + let width_before = reshaped.width; + reshaped.prepend_hyphen(engine, p.fallback); + width += reshaped.width - width_before; + } + } + if maybe_adjust_first_glyph { let reshaped = first.as_mut().or(last.as_mut()).and_then(Item::text_mut); if let Some(reshaped) = reshaped { @@ -1446,3 +1489,49 @@ fn overhang(c: char) -> f64 { _ => 0.0, } } + +/// Whether a hard hyphen ending `pred_line` must repeat at the start of the next line. +fn should_repeat_hyphen(pred_line: &Line) -> bool { + // If the predecessor line does not end with a Dash::HardHyphen, we shall + // not place a hyphen at the start of the next line. + if pred_line.dash != Some(Dash::HardHyphen) { + return false; + } + + // If a space was trimmed after the hyphen, we needn't repeat it. That's the + // case of a text like "...kebab é a -melhor- comida que existe", where the + // hyphens are a kind of emphasis marker. + if pred_line.trimmed.end != pred_line.end { + return false; + } + + // The hyphen should repeat only in the languages that require that feature. 
+ // For more information see the discussion at https://github.com/typst/typst/issues/3235 + let Some(Item::Text(shape)) = pred_line.last.as_ref() else { return false }; + + match shape.lang { + // - Lower Sorbian: see https://dolnoserbski.de/ortografija/psawidla/K3 + // - Czech: see https://prirucka.ujc.cas.cz/?id=164 + // - Croatian: see http://pravopis.hr/pravilo/spojnica/68/ + // - Polish: see https://www.ortograf.pl/zasady-pisowni/lacznik-zasady-pisowni + // - Portuguese: see https://www2.senado.leg.br/bdsf/bitstream/handle/id/508145/000997415.pdf (Base XX) + // - Slovak: see https://www.zones.sk/studentske-prace/gramatika/10620-pravopis-rozdelovanie-slov/ + Lang::LOWER_SORBIAN + | Lang::CZECH + | Lang::CROATIAN + | Lang::POLISH + | Lang::PORTUGUESE + | Lang::SLOVAK => true, + // In Spanish the hyphen is repeated only if the text after the hyphen + // does not start with an uppercase letter; otherwise it must not be repeated. + // + // See § 4.1.1.1.2.e of the "Ortografía de la lengua española" + // https://www.rae.es/ortografía/como-signo-de-división-de-palabras-a-final-de-línea + Lang::SPANISH => pred_line.bidi.text[pred_line.end..] + .chars() + .next() + .map(|c| !c.is_uppercase()) + .unwrap_or(false), + _ => false, + } +} diff --git a/crates/typst/src/layout/inline/shaping.rs b/crates/typst/src/layout/inline/shaping.rs index ff13f776..57b94230 100644 --- a/crates/typst/src/layout/inline/shaping.rs +++ b/crates/typst/src/layout/inline/shaping.rs @@ -447,6 +447,15 @@ impl<'a> ShapedText<'a> { /// Push a hyphen to end of the text. pub fn push_hyphen(&mut self, engine: &Engine, fallback: bool) { + self.insert_hyphen(engine, fallback, Side::Right) + } + + /// Prepend a hyphen to the start of the text. 
+ pub fn prepend_hyphen(&mut self, engine: &Engine, fallback: bool) { + self.insert_hyphen(engine, fallback, Side::Left) + } + + fn insert_hyphen(&mut self, engine: &Engine, fallback: bool, side: Side) { let world = engine.world; let book = world.book(); let fallback_func = if fallback { @@ -464,17 +473,17 @@ impl<'a> ShapedText<'a> { let ttf = font.ttf(); let glyph_id = ttf.glyph_index('-')?; let x_advance = font.to_em(ttf.glyph_hor_advance(glyph_id)?); - let range = self - .glyphs - .last() - .map(|g| g.range.end..g.range.end) - // In the unlikely chance that we hyphenate after an empty line, - // ensure that the glyph range still falls after self.base so - // that subtracting either of the endpoints by self.base doesn't - // underflow. See . - .unwrap_or_else(|| self.base..self.base); + let range = match side { + Side::Left => self.glyphs.first().map(|g| g.range.start..g.range.start), + Side::Right => self.glyphs.last().map(|g| g.range.end..g.range.end), + } + // In the unlikely chance that we hyphenate after an empty line, + // ensure that the glyph range still falls after self.base so + // that subtracting either of the endpoints by self.base doesn't + // underflow. See . 
+ .unwrap_or_else(|| self.base..self.base); self.width += x_advance.at(self.size); - self.glyphs.to_mut().push(ShapedGlyph { + let glyph = ShapedGlyph { font, glyph_id: glyph_id.0, x_advance, @@ -487,7 +496,11 @@ impl<'a> ShapedText<'a> { span: (Span::detached(), 0), is_justifiable: false, script: Script::Common, - }); + }; + match side { + Side::Left => self.glyphs.to_mut().insert(0, glyph), + Side::Right => self.glyphs.to_mut().push(glyph), + } Some(()) }); } diff --git a/crates/typst/src/text/lang.rs b/crates/typst/src/text/lang.rs index 67df0c6e..6809238a 100644 --- a/crates/typst/src/text/lang.rs +++ b/crates/typst/src/text/lang.rs @@ -57,6 +57,7 @@ impl Lang { pub const BOKMÅL: Self = Self(*b"nb ", 2); pub const CATALAN: Self = Self(*b"ca ", 2); pub const CHINESE: Self = Self(*b"zh ", 2); + pub const CROATIAN: Self = Self(*b"hr ", 2); pub const CZECH: Self = Self(*b"cs ", 2); pub const DANISH: Self = Self(*b"da ", 2); pub const DUTCH: Self = Self(*b"nl ", 2); @@ -70,12 +71,14 @@ impl Lang { pub const HUNGARIAN: Self = Self(*b"hu ", 2); pub const ITALIAN: Self = Self(*b"it ", 2); pub const JAPANESE: Self = Self(*b"ja ", 2); + pub const LOWER_SORBIAN: Self = Self(*b"dsb", 3); pub const NYNORSK: Self = Self(*b"nn ", 2); pub const POLISH: Self = Self(*b"pl ", 2); pub const PORTUGUESE: Self = Self(*b"pt ", 2); pub const ROMANIAN: Self = Self(*b"ro ", 2); pub const RUSSIAN: Self = Self(*b"ru ", 2); pub const SERBIAN: Self = Self(*b"sr ", 2); + pub const SLOVAK: Self = Self(*b"sk ", 2); pub const SLOVENIAN: Self = Self(*b"sl ", 2); pub const SPANISH: Self = Self(*b"es ", 2); pub const SWEDISH: Self = Self(*b"sv ", 2); diff --git a/tests/ref/hyphenate-es-captalized-names.png b/tests/ref/hyphenate-es-captalized-names.png new file mode 100644 index 00000000..803d6795 Binary files /dev/null and b/tests/ref/hyphenate-es-captalized-names.png differ diff --git a/tests/ref/hyphenate-es-repeat-hyphen.png b/tests/ref/hyphenate-es-repeat-hyphen.png new file mode 100644 index 
00000000..a4c5a060 Binary files /dev/null and b/tests/ref/hyphenate-es-repeat-hyphen.png differ diff --git a/tests/ref/hyphenate-pt-dash-emphasis.png b/tests/ref/hyphenate-pt-dash-emphasis.png new file mode 100644 index 00000000..cab13ea4 Binary files /dev/null and b/tests/ref/hyphenate-pt-dash-emphasis.png differ diff --git a/tests/ref/hyphenate-pt-no-repeat-hyphen.png b/tests/ref/hyphenate-pt-no-repeat-hyphen.png new file mode 100644 index 00000000..d0e34c9b Binary files /dev/null and b/tests/ref/hyphenate-pt-no-repeat-hyphen.png differ diff --git a/tests/ref/hyphenate-pt-repeat-hyphen-hyphenate-true-with-emphasis.png b/tests/ref/hyphenate-pt-repeat-hyphen-hyphenate-true-with-emphasis.png new file mode 100644 index 00000000..0bb23ab1 Binary files /dev/null and b/tests/ref/hyphenate-pt-repeat-hyphen-hyphenate-true-with-emphasis.png differ diff --git a/tests/ref/hyphenate-pt-repeat-hyphen-hyphenate-true.png b/tests/ref/hyphenate-pt-repeat-hyphen-hyphenate-true.png new file mode 100644 index 00000000..d08859fb Binary files /dev/null and b/tests/ref/hyphenate-pt-repeat-hyphen-hyphenate-true.png differ diff --git a/tests/ref/hyphenate-pt-repeat-hyphen-natural-word-breaking.png b/tests/ref/hyphenate-pt-repeat-hyphen-natural-word-breaking.png new file mode 100644 index 00000000..d08859fb Binary files /dev/null and b/tests/ref/hyphenate-pt-repeat-hyphen-natural-word-breaking.png differ diff --git a/tests/suite/layout/inline/hyphenate.typ b/tests/suite/layout/inline/hyphenate.typ index bcad4d93..c366b38f 100644 --- a/tests/suite/layout/inline/hyphenate.typ +++ b/tests/suite/layout/inline/hyphenate.typ @@ -50,6 +50,58 @@ It's a #emph[Tree]beard. #set text(hyphenate: true) #h(6pt) networks, the rest. +--- hyphenate-pt-repeat-hyphen-natural-word-breaking --- +// The word breaker naturally breaks arco-da-velha at arco-/-da-velha, +// so we shall repeat the hyphen, even though hyphenate is set to false. 
+#set page(width: 4cm) +#set text(lang: "pt") + +Alguma coisa no arco-da-velha é algo que está muito longe. + +--- hyphenate-pt-repeat-hyphen-hyphenate-true --- +#set page(width: 4cm) +#set text(lang: "pt", hyphenate: true) + +Alguma coisa no arco-da-velha é algo que está muito longe. + +--- hyphenate-pt-repeat-hyphen-hyphenate-true-with-emphasis --- +#set page(width: 4cm) +#set text(lang: "pt", hyphenate: true) + +Alguma coisa no _arco-da-velha_ é algo que está muito longe. + +--- hyphenate-pt-no-repeat-hyphen --- +#set page(width: 4cm) +#set text(lang: "pt", hyphenate: true) + +Um médico otorrinolaringologista cuida da garganta do paciente. + +--- hyphenate-pt-dash-emphasis --- +// If the hyphen is followed by a space we shall not repeat the hyphen +// at the next line +#set page(width: 4cm) +#set text(lang: "pt", hyphenate: true) + +Quebabe é a -melhor- comida que existe. + +--- hyphenate-es-repeat-hyphen --- +#set page(width: 6cm) +#set text(lang: "es", hyphenate: true) + +Lo que entendemos por nivel léxico-semántico, en cuanto su sentido más +gramatical: es aquel que estudia el origen y forma de las palabras de +un idioma. + +--- hyphenate-es-captalized-names --- +// If the hyphen is followed by a capitalized word we shall not repeat +// the hyphen at the next line +#set page(width: 6.2cm) +#set text(lang: "es", hyphenate: true) + +Tras el estallido de la contienda Ruiz-Giménez fue detenido junto a sus +dos hermanos y puesto bajo custodia por las autoridades republicanas, con +el objetivo de protegerle de las patrullas de milicianos. + --- costs-widow-orphan --- #set page(height: 60pt)