// Copyright 2023 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//! Helpers for translating `mdbook` projects.
//!
//! The functions here are used to implement a robust
//! internationalization (i18n) workflow for `mdbook`. This allows you
//! to translate your books into other languages while also making it
//! easy to keep the translations up to date as you edit the original
//! source text.
//!
//! See <https://github.com/google/mdbook-i18n-helpers> for details on
//! how to use the supplied `mdbook` plugins.
use polib::catalog::Catalog;
use pulldown_cmark::{
BrokenLinkCallback, CodeBlockKind, DefaultBrokenLinkCallback, Event, LinkType, Tag, TagEnd,
};
use pulldown_cmark_to_cmark::{
calculate_code_block_token_count, cmark_resume_with_options, Options, State,
};
use std::sync::OnceLock;
use syntect::easy::ScopeRangeIterator;
use syntect::parsing::{ParseState, Scope, ScopeStack, SyntaxSet};
pub mod directives;
pub mod gettext;
pub mod normalize;
pub mod preprocessors;
pub mod xgettext;
/// Re-wrap the sources field of a message.
///
/// This function tries to wrap the `file:lineno` pairs so they look
/// the same as what you get from `msgcat` or `msgmerge`.
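///
/// # Examples
///
/// A minimal usage sketch; the source references below are made up
/// for illustration:
///
/// ```
/// use mdbook_i18n_helpers::wrap_sources;
///
/// // A long run of `file:lineno` pairs is wrapped at 76 columns.
/// let sources = "guide/src/chapter.md:123 ".repeat(10);
/// let wrapped = wrap_sources(sources.trim());
/// assert!(wrapped.lines().count() > 1);
/// assert!(wrapped.lines().all(|line| line.len() <= 76));
/// ```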
pub fn wrap_sources(sources: &str) -> String {
let options = textwrap::Options::new(76)
.break_words(false)
.word_splitter(textwrap::WordSplitter::NoHyphenation);
textwrap::refill(sources, options)
}
/// Like `mdbook::utils::new_cmark_parser`, but also passes a
/// `BrokenLinkCallback`.
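///
/// # Examples
///
/// A minimal sketch of parsing without a broken-link callback; the
/// input string is only illustrative:
///
/// ```
/// use mdbook_i18n_helpers::new_cmark_parser;
/// use pulldown_cmark::{DefaultBrokenLinkCallback, Event, Tag};
///
/// let parser = new_cmark_parser::<DefaultBrokenLinkCallback>("~~old~~ *new*", None);
/// let events: Vec<_> = parser.collect();
/// // Strikethrough is enabled, so `~~old~~` produces dedicated events.
/// assert!(events
///     .iter()
///     .any(|event| matches!(event, Event::Start(Tag::Strikethrough))));
/// ```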
pub fn new_cmark_parser<'input, F: BrokenLinkCallback<'input>>(
text: &'input str,
broken_link_callback: Option<F>,
) -> pulldown_cmark::Parser<'input, F> {
let mut options = pulldown_cmark::Options::empty();
options.insert(pulldown_cmark::Options::ENABLE_TABLES);
options.insert(pulldown_cmark::Options::ENABLE_OLD_FOOTNOTES);
options.insert(pulldown_cmark::Options::ENABLE_STRIKETHROUGH);
options.insert(pulldown_cmark::Options::ENABLE_TASKLISTS);
options.insert(pulldown_cmark::Options::ENABLE_HEADING_ATTRIBUTES);
pulldown_cmark::Parser::new_with_broken_link_callback(text, options, broken_link_callback)
}
/// Extract Markdown events from `text`.
///
/// The `state` can be used to give the parsing context. In
/// particular, if a code block has started, the text should be parsed
/// without interpreting special Markdown characters.
///
/// The events are labeled with the line number where they start in
/// the document.
///
/// # Examples
///
/// ```
/// use mdbook_i18n_helpers::extract_events;
/// use pulldown_cmark::{Event, Tag, TagEnd};
///
/// assert_eq!(
/// extract_events("Hello,\nworld!", None),
/// vec![
/// (1, Event::Start(Tag::Paragraph)),
/// (1, Event::Text("Hello,".into())),
/// (1, Event::Text(" ".into())),
/// (2, Event::Text("world!".into())),
/// (1, Event::End(TagEnd::Paragraph)),
/// ]
/// );
/// ```
pub fn extract_events<'a>(text: &'a str, state: Option<State<'a>>) -> Vec<(usize, Event<'a>)> {
// Expand a `[foo]` style link into `[foo][foo]`.
fn expand_shortcut_link(tag: Tag<'_>) -> Tag<'_> {
match tag {
Tag::Link {
link_type: LinkType::Shortcut,
dest_url,
title,
id,
} => Tag::Link {
link_type: LinkType::Reference,
dest_url,
title,
id,
},
Tag::Image {
link_type: LinkType::Shortcut,
dest_url,
title,
id,
} => Tag::Image {
link_type: LinkType::Reference,
dest_url,
title,
id,
},
_ => tag,
}
}
// Offsets of each newline in the input, used to calculate line
// numbers from byte offsets.
let offsets = text
.match_indices('\n')
.map(|(offset, _)| offset)
.collect::<Vec<_>>();
match state {
// If we're in a code block, we disable the normal parsing and
// return lines of text. This matches the behavior of the
// parser in this case.
Some(state) if state.is_in_code_block => text
.split_inclusive('\n')
.enumerate()
.map(|(idx, line)| (idx + 1, Event::Text(line.into())))
.collect(),
        // Otherwise, we parse the text normally.
_ => new_cmark_parser::<'a, DefaultBrokenLinkCallback>(text, None)
.into_offset_iter()
.map(|(event, range)| {
let lineno = offsets.partition_point(|&o| o < range.start) + 1;
let event = match event {
Event::SoftBreak => Event::Text(" ".into()),
// Shortcut links like "[foo]" end up as "[foo]"
// in output. By changing them to a reference
// link, the link is expanded on the fly and the
// output becomes self-contained.
Event::Start(tag @ (Tag::Link { .. } | Tag::Image { .. })) => {
Event::Start(expand_shortcut_link(tag))
}
_ => event,
};
(lineno, event)
})
.collect(),
}
}
/// Markdown events grouped by type.
#[derive(Debug, Clone, PartialEq)]
pub enum Group<'a> {
/// Markdown events which should be translated.
///
/// This includes `[Text("foo")]` as well as sequences with text
    /// such as `[Start(Emphasis), Text("foo"), End(Emphasis)]`.
Translate {
events: Vec<(usize, Event<'a>)>,
/// A comment that may be associated with the translation text.
comment: String,
},
/// Markdown events which should be skipped when translating.
///
    /// This includes structural events such as
    /// `Start(Tag::Heading { .. })`.
Skip(Vec<(usize, Event<'a>)>),
}
#[derive(Debug, Default)]
struct GroupingContext {
skip_next_group: bool,
comments: Vec<String>,
}
impl GroupingContext {
fn clear_skip_next_group(self) -> Self {
Self {
skip_next_group: false,
..self
}
}
}
/// Group Markdown events into translatable and skipped events.
///
/// This function will partition the input events into groups of
/// events which should be translated or skipped. Concatenating the
/// events from all groups gives you back the original events.
///
/// # Examples
///
/// ```
/// use mdbook_i18n_helpers::{extract_events, group_events, Group};
/// use pulldown_cmark::{Event, Tag, TagEnd};
///
/// let events = extract_events("- A list item.", None);
/// assert_eq!(
/// events,
/// vec![
/// (1, Event::Start(Tag::List(None))),
/// (1, Event::Start(Tag::Item)),
/// (1, Event::Text("A list item.".into())),
/// (1, Event::End(TagEnd::Item)),
/// (1, Event::End(TagEnd::List(false))),
/// ],
/// );
///
/// let groups = group_events(&events);
/// assert_eq!(
/// groups,
/// vec![
/// Group::Skip(vec![
/// (1, Event::Start(Tag::List(None))),
/// (1, Event::Start(Tag::Item)),
/// ]),
/// Group::Translate {
/// events: vec![
/// (1, Event::Text("A list item.".into())),
/// ], comment: "".into()},
/// Group::Skip(vec![
/// (1, Event::End(TagEnd::Item)),
/// (1, Event::End(TagEnd::List(false))),
/// ]),
/// ]
/// );
/// ```
pub fn group_events<'a>(events: &'a [(usize, Event<'a>)]) -> Vec<Group<'a>> {
#[derive(Debug)]
enum State {
Translate(usize),
Skip(usize),
}
impl State {
/// Creates groups based on the capturing state and context.
fn into_groups<'a>(
self,
idx: usize,
events: &'a [(usize, Event<'a>)],
mut ctx: GroupingContext,
) -> (Vec<Group<'a>>, GroupingContext) {
match self {
State::Translate(start) => {
if ctx.skip_next_group {
(
vec![Group::Skip(events[start..idx].into())],
ctx.clear_skip_next_group(),
)
} else if is_codeblock_group(&events[start..idx]) {
parse_codeblock(&events[start..idx], ctx)
} else {
(
vec![Group::Translate {
events: events[start..idx].into(),
comment: std::mem::take(&mut ctx.comments).join(" "),
}],
ctx,
)
}
}
State::Skip(start) => (vec![Group::Skip(events[start..idx].into())], ctx),
}
}
}
let mut groups = Vec::new();
let mut state = State::Skip(0);
let mut ctx = GroupingContext::default();
for (idx, (_, event)) in events.iter().enumerate() {
match event {
// These block-level events force new groups. We do this
// because we want to include these events in the group to
// make the group self-contained.
Event::Start(Tag::Paragraph | Tag::CodeBlock(..)) => {
// A translatable group starts here.
let mut next_groups;
(next_groups, ctx) = state.into_groups(idx, events, ctx);
groups.append(&mut next_groups);
state = State::Translate(idx);
}
Event::End(TagEnd::Paragraph | TagEnd::CodeBlock) => {
// A translatable group ends after `idx`.
let idx = idx + 1;
let mut next_groups;
(next_groups, ctx) = state.into_groups(idx, events, ctx);
groups.append(&mut next_groups);
state = State::Skip(idx);
}
// Inline events start or continue a translating group.
Event::Start(
Tag::Emphasis
| Tag::Strong
| Tag::Strikethrough
| Tag::Link { .. }
| Tag::Image { .. },
)
| Event::End(
TagEnd::Emphasis
| TagEnd::Strong
| TagEnd::Strikethrough
| TagEnd::Link
| TagEnd::Image,
)
| Event::Text(_)
| Event::Code(_)
| Event::FootnoteReference(_)
| Event::SoftBreak
| Event::HardBreak => {
// If we're currently skipping, then a new
// translatable group starts here.
if let State::Skip(_) = state {
let mut next_groups;
(next_groups, ctx) = state.into_groups(idx, events, ctx);
groups.append(&mut next_groups);
state = State::Translate(idx);
}
}
Event::Html(s) | Event::InlineHtml(s) => {
match directives::find(s) {
Some(directives::Directive::Skip) => {
// If in the middle of translation, finish it.
if let State::Translate(_) = state {
let mut next_groups;
(next_groups, ctx) = state.into_groups(idx, events, ctx);
groups.append(&mut next_groups);
                        // Restart translation here: subtle, but needed
                        // so that the skip directive applies to the rest
                        // of the inline content.
state = State::Translate(idx);
}
ctx.skip_next_group = true;
}
Some(directives::Directive::Comment(comment)) => {
// If in the middle of translation, finish it.
if let State::Translate(_) = state {
let mut next_groups;
(next_groups, ctx) = state.into_groups(idx, events, ctx);
groups.append(&mut next_groups);
                        // Restart translation here: subtle, but needed
                        // so that the comment attaches to the rest of
                        // the inline content.
state = State::Translate(idx);
}
ctx.comments.push(comment);
}
_ => {
match event {
Event::Html(_) => {
                            // Otherwise, treat this as a skipping group since it is a block-level HTML tag.
if let State::Translate(_) = state {
let mut next_groups;
(next_groups, ctx) = state.into_groups(idx, events, ctx);
groups.append(&mut next_groups);
state = State::Skip(idx);
}
}
Event::InlineHtml(_) =>
// If we're currently skipping, then a new
// translatable group starts here.
{
if let State::Skip(_) = state {
let mut next_groups;
(next_groups, ctx) = state.into_groups(idx, events, ctx);
groups.append(&mut next_groups);
state = State::Translate(idx);
}
}
                        // We are inside a match on `Event::Html` and `Event::InlineHtml`; other variants are not possible.
_ => unreachable!(),
}
}
}
}
// All other block-level events start or continue a
// skipping group.
_ => {
if let State::Translate(_) = state {
let mut next_groups;
(next_groups, ctx) = state.into_groups(idx, events, ctx);
groups.append(&mut next_groups);
state = State::Skip(idx);
}
}
}
}
match state {
State::Translate(start) => groups.push(Group::Translate {
events: events[start..].into(),
comment: "".into(),
}),
State::Skip(start) => groups.push(Group::Skip(events[start..].into())),
}
groups
}
/// Returns true if the events appear to be a codeblock.
fn is_codeblock_group(events: &[(usize, Event<'_>)]) -> bool {
matches!(
events,
[
(_, Event::Start(Tag::CodeBlock(_))),
..,
(_, Event::End(TagEnd::CodeBlock))
]
)
}
/// Returns true if the scope should be translated.
fn is_translate_scope(x: Scope) -> bool {
static SCOPE_STRING: OnceLock<Scope> = OnceLock::new();
static SCOPE_COMMENT: OnceLock<Scope> = OnceLock::new();
let scope_string = SCOPE_STRING.get_or_init(|| Scope::new("string").unwrap());
let scope_comment = SCOPE_COMMENT.get_or_init(|| Scope::new("comment").unwrap());
scope_string.is_prefix_of(x) || scope_comment.is_prefix_of(x)
}
/// Creates groups by checking the code block with a heuristic.
fn heuristic_codeblock<'a>(
events: &'a [(usize, Event<'_>)],
mut ctx: GroupingContext,
) -> (Vec<Group<'a>>, GroupingContext) {
let is_translate = match events {
[(_, Event::Start(Tag::CodeBlock(_))), .., (_, Event::End(TagEnd::CodeBlock))] => {
let (codeblock_text, _) = reconstruct_markdown(events, None);
            // Heuristic to check whether the codeblock has a string
            // literal or a line comment; if it has neither, we skip
            // it. We may actually want to use a lexer here to make
            // this more robust.
codeblock_text.contains('"') || codeblock_text.contains("//")
}
_ => true,
};
if is_translate {
(
vec![Group::Translate {
events: events.into(),
comment: std::mem::take(&mut ctx.comments).join(" "),
}],
ctx,
)
} else {
(vec![Group::Skip(events.into())], ctx)
}
}
/// Creates groups by parsing the code block.
fn parse_codeblock<'a>(
events: &'a [(usize, Event<'_>)],
mut ctx: GroupingContext,
) -> (Vec<Group<'a>>, GroupingContext) {
    // Detect the language from the code block's language identifier.
static SYNTAX_SET: OnceLock<SyntaxSet> = OnceLock::new();
let ss = SYNTAX_SET.get_or_init(SyntaxSet::load_defaults_newlines);
let syntax = if let (_, Event::Start(Tag::CodeBlock(CodeBlockKind::Fenced(x)))) = &events[0] {
ss.find_syntax_by_token(x.split(',').next().unwrap())
} else {
None
};
let Some(syntax) = syntax else {
        // If there is no language specifier, fall back to the heuristic.
return heuristic_codeblock(events, ctx);
};
let mut ps = ParseState::new(syntax);
let mut ret = vec![];
for (idx, event) in events.iter().enumerate() {
match event {
(text_line, Event::Text(text)) => {
let mut stack = ScopeStack::new();
let mut stack_failure = false;
let Ok(ops) = ps.parse_line(text, ss) else {
                    // If parsing fails, the text event should be translated.
ret.push(Group::Translate {
events: events[idx..idx + 1].into(),
comment: std::mem::take(&mut ctx.comments).join(" "),
});
continue;
};
let mut translate_events = vec![];
let mut groups = vec![];
for (range, op) in ScopeRangeIterator::new(&ops, text) {
if stack.apply(op).is_err() {
stack_failure = true;
break;
}
if range.is_empty() {
continue;
}
                    // Calculate the line number where the range starts.
let range_line = if range.start == 0 {
*text_line
} else {
text_line + text[0..range.start].lines().count() - 1
};
let text = &text[range];
                    // Whitespace between translatable texts should be added to the
                    // translate group. So all whitespace is added to the translate
                    // events buffer temporarily, and trailing whitespace is removed
                    // at the end.
let is_whitespace = text.trim_matches(&[' ', '\t'] as &[_]).is_empty();
let is_translate = stack.scopes.iter().any(|x| is_translate_scope(*x));
if is_translate || (is_whitespace && !translate_events.is_empty()) {
translate_events.push((range_line, Event::Text(text.into())));
} else {
let whitespace_events = extract_trailing_whitespaces(&mut translate_events);
if !translate_events.is_empty() {
groups.push(Group::Translate {
events: std::mem::take(&mut translate_events),
comment: std::mem::take(&mut ctx.comments).join(" "),
});
}
if !whitespace_events.is_empty() {
groups.push(Group::Skip(whitespace_events));
}
groups.push(Group::Skip(vec![(range_line, Event::Text(text.into()))]));
}
}
let whitespace_events = extract_trailing_whitespaces(&mut translate_events);
if !translate_events.is_empty() {
groups.push(Group::Translate {
events: std::mem::take(&mut translate_events),
comment: std::mem::take(&mut ctx.comments).join(" "),
});
}
if !whitespace_events.is_empty() {
groups.push(Group::Skip(whitespace_events));
}
if stack_failure {
                    // If a stack operation failed, the text event should be translated.
ret.push(Group::Translate {
events: events[idx..idx + 1].into(),
comment: std::mem::take(&mut ctx.comments).join(" "),
});
} else {
ret.append(&mut groups);
}
}
_ => {
ret.push(Group::Skip(events[idx..idx + 1].into()));
}
}
}
(ret, ctx)
}
/// Extract trailing events which contain only whitespace.
fn extract_trailing_whitespaces<'a>(buf: &mut Vec<(usize, Event<'a>)>) -> Vec<(usize, Event<'a>)> {
let mut ret = vec![];
while let Some(last) = buf.last() {
match &last.1 {
Event::Text(text) if text.as_ref().trim_matches(&[' ', '\t'] as &[_]).is_empty() => {
let last = buf.pop().unwrap();
ret.push(last);
}
_ => break,
}
}
ret.reverse();
ret
}
/// Render a slice of Markdown events back to Markdown.
///
/// # Examples
///
/// ```
/// use mdbook_i18n_helpers::{extract_events, reconstruct_markdown};
/// use pulldown_cmark::{Event, Tag};
///
/// let group = extract_events("Hello *world!*", None);
/// let (reconstructed, _) = reconstruct_markdown(&group, None);
/// assert_eq!(reconstructed, "Hello _world!_");
/// ```
///
/// Notice how this will normalize the Markdown to use `_` for
/// emphasis and `**` for strong emphasis. The style is chosen to
/// match the [Google developer documentation style
/// guide](https://developers.google.com/style/text-formatting).
pub fn reconstruct_markdown<'a>(
group: &[(usize, Event<'a>)],
state: Option<State<'a>>,
) -> (String, State<'a>) {
let events = group.iter().map(|(_, event)| event);
let code_block_token_count = calculate_code_block_token_count(events.clone()).unwrap_or(3);
let mut markdown = String::new();
let options = Options {
code_block_token_count,
list_token: '-',
emphasis_token: '_',
strong_token: "**",
..Options::default()
};
// Advance the true state, but throw away the rendered Markdown
// since it can contain unwanted padding.
let new_state = cmark_resume_with_options(
events.clone(),
String::new(),
state.clone(),
options.clone(),
)
.unwrap();
// Block quotes and lists add padding to the state, which is
// reflected in the rendered Markdown. We want to capture the
// Markdown without the padding to remove the effect of these
// structural elements. Similarly, we don't want extra newlines at
// the start.
let simplified_state = state.map(|state| State {
newlines_before_start: 0,
padding: Vec::new(),
..state
});
cmark_resume_with_options(events, &mut markdown, simplified_state, options).unwrap();
// Even with `newlines_before_start` set to zero, we get a leading
// `\n` for code blocks (since they must start on a new line). We
// can safely trim this here since we know that we always
// reconstruct Markdown for a self-contained group of events.
(String::from(markdown.trim_start_matches('\n')), new_state)
}
#[derive(Debug, PartialEq)]
pub struct ExtractedMessage {
pub message: String,
pub comment: String,
}
impl From<&str> for ExtractedMessage {
fn from(s: &str) -> Self {
ExtractedMessage {
message: s.to_owned(),
comment: "".into(),
}
}
}
/// Extract translatable strings from `document`.
///
/// # Examples
///
/// Structural markup like headings and lists is removed from the
/// messages:
///
/// ```
/// use mdbook_i18n_helpers::extract_messages;
///
/// assert_eq!(
/// extract_messages("# A heading"),
/// vec![(1, "A heading".into())],
/// );
/// assert_eq!(
/// extract_messages(
/// "1. First item\n\
/// 2. Second item\n"
/// ),
/// vec![
/// (1, "First item".into()),
/// (2, "Second item".into()),
/// ],
/// );
/// ```
///
/// Indentation due to structural elements like block quotes and lists
/// is ignored:
///
/// ```
/// use mdbook_i18n_helpers::extract_messages;
///
/// let messages = extract_messages(
/// "> * Hello, this is a\n\
/// > list in a quote.\n\
/// >\n\
/// > This is the second\n\
/// > paragraph.\n"
/// );
/// assert_eq!(
/// messages,
/// vec![
/// (1, "Hello, this is a list in a quote.".into()),
/// (4, "This is the second paragraph.".into()),
/// ],
/// );
/// ```
pub fn extract_messages(document: &str) -> Vec<(usize, ExtractedMessage)> {
let events = extract_events(document, None);
let mut messages = Vec::new();
let mut state = None;
for group in group_events(&events) {
match group {
Group::Translate { events, comment } => {
if let Some((lineno, _)) = events.first() {
let (text, new_state) = reconstruct_markdown(&events, state);
                    // Skip empty messages since they are special:
                    // they contain the PO file metadata.
if !text.trim().is_empty() {
messages.push((
*lineno,
ExtractedMessage {
message: text,
comment,
},
));
}
state = Some(new_state);
}
}
Group::Skip(events) => {
let (_, new_state) = reconstruct_markdown(&events, state);
state = Some(new_state);
}
}
}
messages
}
/// Trim `new_events` if they're wrapped in an unwanted paragraph.
///
/// If `new_events` is wrapped in a paragraph and `old_events` isn't,
/// then the paragraph is removed. This is useful when a text event
/// has been wrapped in a paragraph:
///
/// ```
/// use pulldown_cmark::{Event, Tag, TagEnd};
/// use mdbook_i18n_helpers::{extract_events, reconstruct_markdown, trim_paragraph};
///
/// let old_events = vec![(1, Event::Text("A line of text".into()))];
/// let (markdown, _) = reconstruct_markdown(&old_events, None);
/// let new_events = extract_events(&markdown, None);
/// // The stand-alone text has been wrapped in an extra paragraph:
/// assert_eq!(
/// new_events,
/// &[
/// (1, Event::Start(Tag::Paragraph)),
/// (1, Event::Text("A line of text".into())),
/// (1, Event::End(TagEnd::Paragraph)),
/// ],
/// );
///
/// assert_eq!(
/// trim_paragraph(&new_events, &old_events),
/// &[(1, Event::Text("A line of text".into()))],
/// );
/// ```
pub fn trim_paragraph<'a, 'event>(
new_events: &'a [(usize, Event<'event>)],
old_events: &'a [(usize, Event<'event>)],
) -> &'a [(usize, Event<'event>)] {
use pulldown_cmark::Event::{End, Start};
use pulldown_cmark::Tag::Paragraph;
match new_events {
[(_, Start(Paragraph)), inner @ .., (_, End(TagEnd::Paragraph))] => match old_events {
[(_, Start(Paragraph)), .., (_, End(TagEnd::Paragraph))] => new_events,
[..] => inner,
},
[..] => new_events,
}
}
/// Translate `events` using `catalog`.
pub fn translate_events<'a>(
events: &'a [(usize, Event<'a>)],
catalog: &'a Catalog,
) -> Vec<(usize, Event<'a>)> {
let mut translated_events = Vec::new();
let mut state = None;
for group in group_events(events) {
match group {
Group::Translate { events, .. } => {
// Reconstruct the message.
let (msgid, new_state) = reconstruct_markdown(&events, state.clone());
let translated = catalog
.find_message(None, &msgid, None)
.filter(|msg| !msg.flags().is_fuzzy() && msg.is_translated())
.and_then(|msg| msg.msgstr().ok());
match translated {
Some(msgstr) => {
// Generate new events for `msgstr`, taking
// care to trim away unwanted paragraphs.
translated_events.extend_from_slice(trim_paragraph(
&extract_events(msgstr, state),
&events,
));
}
None => translated_events.extend_from_slice(&events),
}
// Advance the state.
state = Some(new_state);
}
Group::Skip(events) => {
// Copy the events unchanged to the output.
translated_events.extend_from_slice(&events);
// Advance the state.
let (_, new_state) = reconstruct_markdown(&events, state);
state = Some(new_state);
}
}
}
translated_events
}
#[cfg(test)]
mod tests {
use super::*;
use pretty_assertions::assert_eq;
use pulldown_cmark::CodeBlockKind;
use pulldown_cmark::Event::*;
use pulldown_cmark::HeadingLevel::*;
use pulldown_cmark::Tag::*;
    /// Extract messages in `document` and assert that they match `expected`.
#[track_caller]
fn assert_extract_messages(document: &str, expected: &[(usize, &str)]) {
assert_eq!(
extract_messages(document)
.iter()
.map(|(lineno, msg)| (*lineno, &msg.message[..]))
.collect::<Vec<_>>(),
expected,
);
}
#[test]
fn extract_events_empty() {
assert_eq!(extract_events("", None), vec![]);
}
#[test]
fn extract_events_paragraph() {
assert_eq!(
extract_events("foo bar", None),
vec![
(1, Start(Paragraph)),
(1, Text("foo bar".into())),
(1, End(TagEnd::Paragraph)),
]
);
}
#[test]
fn extract_events_softbreak() {
assert_eq!(
extract_events("foo\nbar", None),
vec![
(1, Start(Paragraph)),
(1, Text("foo".into())),
(1, Text(" ".into())),
(2, Text("bar".into())),
(1, End(TagEnd::Paragraph)),
]
);
}
#[test]
fn extract_events_heading() {
assert_eq!(
extract_events("# Foo Bar", None),
vec![
(
1,
Start(Tag::Heading {
level: H1,
id: None,
classes: vec![],
attrs: vec![]
})
),
(1, Text("Foo Bar".into())),
(1, End(TagEnd::Heading(H1))),
]
);
}
#[test]
fn extract_events_list_item() {
assert_eq!(
extract_events("* foo bar", None),
vec![
(1, Start(List(None))),
(1, Start(Item)),
(1, Text("foo bar".into())),
(1, End(TagEnd::Item)),
(1, End(TagEnd::List(false))),
]
);
}
#[test]
fn extract_events_code_block() {
let (_, state) =
reconstruct_markdown(&[(1, Start(CodeBlock(CodeBlockKind::Indented)))], None);
assert_eq!(
extract_events("foo\nbar\nbaz", Some(state)),
vec![
(1, Text("foo\n".into())),
(2, Text("bar\n".into())),
(3, Text("baz".into())),
]
);
// Compare with extraction without state:
assert_eq!(
extract_events("foo\nbar\nbaz", None),
vec![
(1, Start(Paragraph)),
(1, Text("foo".into())),
(1, Text(" ".into())),
(2, Text("bar".into())),
(2, Text(" ".into())),
(3, Text("baz".into())),
(1, End(TagEnd::Paragraph)),
]
);
}
#[test]
fn extract_events_comments() {
assert_eq!(
extract_events("<!-- mdbook-xgettext:skip -->\nHello", None),
vec![
(1, Start(HtmlBlock)),
(1, Html("<!-- mdbook-xgettext:skip -->\n".into())),
(1, End(TagEnd::HtmlBlock)),
(2, Start(Paragraph)),
(2, Text("Hello".into())),
(2, End(TagEnd::Paragraph)),
]
);
}
#[test]
fn extract_messages_empty() {
assert_extract_messages("", &[]);
}
#[test]
fn extract_messages_keep_empty_inline_html() {
// Keep inline html tags
assert_extract_messages("<span></span>", &[(1, "<span></span>")]);
}
#[test]
fn extract_messages_keep_whitespace_inline_html() {
// span is an inline html tag so even whitespace is kept as is
assert_extract_messages("<span> </span>", &[(1, "<span> </span>")]);
}
#[test]
fn extract_messages_ignore_whitespace_only_block_html() {
// Whitespace in block level html tags is ignored
assert_extract_messages("<p> </p>", &[]);
}
#[test]
fn extract_messages_single_line() {
assert_extract_messages("This is a paragraph.", &[(1, "This is a paragraph.")]);
}
#[test]
fn extract_messages_simple() {
assert_extract_messages(
"This is\n\
the first\n\
paragraph.🦀\n\
\n\
Second paragraph.",
&[
(1, "This is the first paragraph.🦀"),
(5, "Second paragraph."),
],
);
}
#[test]
fn extract_messages_leading_newlines() {
assert_extract_messages(
"\n\
\n\
\n\
This is the\n\
first paragraph.",
&[(4, "This is the first paragraph.")],
);
}
#[test]
fn extract_messages_trailing_newlines() {
assert_extract_messages(
"This is\n\
a paragraph.\n\
\n\
\n",
&[(1, "This is a paragraph.")],
);
}
#[test]
fn extract_messages_styled_text() {
// The parser normalizes "*emphasis*" to "_emphasis_" and
// "__strong emphasis__" to "**strong emphasis**".
assert_extract_messages(
"**This** __~~message~~__ _has_ `code` *style*\n",
&[(1, "**This** **~~message~~** _has_ `code` _style_")],
);
}
#[test]
fn extract_messages_inline_html() {
// Inline HTML tag is kept as is in the translation.
assert_extract_messages(
"Hi from <span dir=\"ltr\">Rust</div>",
&[(1, "Hi from <span dir=\"ltr\">Rust</div>")],
);
}
#[test]
fn extract_messages_block_html() {
        // Block-level HTML tags are skipped, but the text inside is extracted.
assert_extract_messages(
"<div class=\"warning\">\n\
\n\
Beware of the dog!\n\
\n\
</div>",
&[(3, "Beware of the dog!")],
);
}
#[test]
fn extract_messages_mixed_html() {
        // Block-level HTML tags are skipped, but the text inside is extracted, with inline HTML kept as is.
assert_extract_messages(
"<div>\n\
\n\
Hi from <span dir=\"ltr\">Rust</span>\n\
\n\
</div>",
&[(3, "Hi from <span dir=\"ltr\">Rust</span>")],
);
}
#[test]
fn extract_messages_inline_link() {
assert_extract_messages(
"See [this page](https://example.com) for more info.",
&[(1, "See [this page](https://example.com) for more info.")],
);
}
#[test]
fn extract_messages_reference_link() {
assert_extract_messages(
"See [this page][1] for more info.\n\n\
[1]: https://example.com",
// The parser expands reference links on the fly.
&[(1, "See [this page](https://example.com) for more info.")],
);
}
#[test]
fn extract_messages_collapsed_link() {
// We make the parser expand collapsed links on the fly.
assert_extract_messages(
"Click [here][]!\n\n\
[here]: http://example.net/",
&[(1, "Click [here](http://example.net/)!")],
);
}
#[test]
fn extract_messages_shortcut_link() {
assert_extract_messages(
"Click [here]!\n\n\
[here]: http://example.net/",
&[(1, "Click [here](http://example.net/)!")],
);
}
#[test]
fn extract_messages_autolink() {
assert_extract_messages(
"Visit <http://example.net>!",
&[(1, "Visit <http://example.net>!")],
);
}
#[test]
fn extract_messages_email() {
assert_extract_messages(
"Contact <[email protected]>!",
&[(1, "Contact <[email protected]>!")],
);
}
#[test]
fn extract_messages_broken_reference_link() {
// A reference link without the corresponding link definition
// results in an escaped link.
//
// See `SourceMap::extract_messages` for a more complex
// approach which can work around this in some cases.
assert_extract_messages("[foo][unknown]", &[(1, r"\[foo\]\[unknown\]")]);
}
#[test]
fn extract_messages_footnotes() {
assert_extract_messages(
"
The document[^1] text.

[^1]: The footnote text.
",
&[
(2, "The document[^1] text."), //
(4, "The footnote text."),
],
);
}
#[test]
fn extract_messages_block_quote() {
assert_extract_messages(
r"One of my favorite quotes is:
> Don't believe everything you read on the Internet.
>
> I didn't say this second part, but I needed a paragraph for testing.
--Abraham Lincoln
",
&[
(1, "One of my favorite quotes is:"),
(3, "Don't believe everything you read on the Internet."),
(
5,
"I didn't say this second part, but I needed a paragraph for testing.",
),
(7, "\\--Abraham Lincoln"),
],
);
}
#[test]
fn extract_messages_table() {
let input = "\
| Module Type | Description\n\
|-------------------|-------------------------\n\
| `rust_binary` | Produces a Rust binary.\n\
| `rust_library` | Produces a Rust library.\n\
";
assert_extract_messages(
input,
&[
(1, "Module Type"),
(1, "Description"),
(3, "`rust_binary`"),
(3, "Produces a Rust binary."),
(4, "`rust_library`"),
(4, "Produces a Rust library."),
],
);
}
#[test]
fn extract_messages_code_block() {
assert_extract_messages(
"Preamble\n```rust\n// Example:\nfn hello() {\n some_code()\n\n todo!()\n}\n```\nPostamble",
&[
(1, "Preamble"),
(
3,
"// Example:\n",
),
(10, "Postamble"),
],
);
}
#[test]
fn extract_messages_two_code_blocks() {
assert_extract_messages(
"```\n\
\"First\" block\n\
```\n\
```\n\
\"Second\" block\n\
```\n\
",
&[
(1, "```\n\"First\" block\n```"), //
(4, "```\n\"Second\" block\n```"),
],
);
}
#[test]
fn extract_messages_quoted_code_block() {
assert_extract_messages(
"\
> Preamble\n\
> ```rust\n\
> fn hello() {\n\
> some_code()\n\
>\n\
> // FIXME: do something here!\n\
> todo!()\n\
> }\n\
> ```\n\
> Postamble",
&[
(1, "Preamble"),
(6, "// FIXME: do something here!\n"),
(10, "Postamble"),
],
);
}
#[test]
fn extract_messages_code_block_with_block_comment() {
assert_extract_messages(
"```rust\n\
/* block comment\n\
* /* nested block comment\n\
* */\n\
* \n\
* \n\
* \n\
* */\n\
```\n",
&[(
2,
"/* block comment\n* /* nested block comment\n* */\n* \n* \n* \n* */",
)],
);
}
#[test]
fn extract_messages_code_block_with_continuous_line_comments() {
assert_extract_messages(
r"```rust
// continuous
// line
// comments
{
// continuous
// line
// comments
let a = 1; // single line comment
let b = 1; // single line comment
}
```",
&[
(2, "// continuous\n// line\n// comments\n"),
(6, "// continuous\n // line\n // comments\n"),
(9, "// single line comment\n"),
(10, "// single line comment\n"),
],
);
}
#[test]
fn extract_messages_multi_language_code_blocks() {
assert_extract_messages(
r#"```c
// C
'C'; "C";
```
```html
<!-- HTML
HTML -->
```
```javascript
`JavaScript`
```
```ruby
# Ruby
```"#,
&[
(2, "// C\n'C'"),
(3, "\"C\""),
(6, "<!-- HTML\nHTML -->"),
(10, "`JavaScript`"),
(13, "# Ruby\n"),
],
);
}
#[test]
fn extract_messages_details() {
        // This isn't great: we lose text following an HTML tag:
assert_extract_messages(
"Preamble\n\
<details>\n\
Some Details\n\
</details>\n\
\n\
Postamble",
&[
(1, "Preamble"), //
// Missing "Some Details"
(6, "Postamble"),
],
);
// It works well enough when `<details>` has blank lines
// before and after.
assert_extract_messages(
"Preamble\n\
\n\
<details>\n\
\n\
Some Details\n\
\n\
</details>\n\
\n\
Postamble",
&[
(1, "Preamble"), //
(5, "Some Details"),
(9, "Postamble"),
],
);
}
#[test]
fn extract_messages_list() {
assert_extract_messages(
"Some text\n * List item 1🦀\n * List item 2\n\nMore text",
&[
(1, "Some text"), //
(2, "List item 1🦀"),
(3, "List item 2"),
(5, "More text"),
],
);
}
#[test]
fn extract_messages_multilevel_list() {
assert_extract_messages(
"Some text\n * List item 1\n * List item 2\n * Sublist 1\n * Sublist 2\n\nMore text",
&[
(1, "Some text"), //
(2, "List item 1"),
(3, "List item 2"),
(4, "Sublist 1"),
(5, "Sublist 2"),
(7, "More text"),
],
);
}
#[test]
fn extract_messages_list_with_paragraphs() {
assert_extract_messages(
r"* Item 1.
* Item 2,
two lines.

* Sub 1.
* Sub 2.
",
&[
(1, "Item 1."),
(2, "Item 2, two lines."),
(5, "Sub 1."),
(6, "Sub 2."),
],
);
}
#[test]
fn extract_messages_headings() {
assert_extract_messages(
r"Some text
# Headline News🦀

* A
* List

## Subheading
",
&[
(1, "Some text"),
(2, "Headline News🦀"),
(4, "A"),
(5, "List"),
(7, "Subheading"),
],
);
}
#[test]
fn extract_messages_code_followed_by_details() {
// This is a regression test for an error that would
// incorrectly combine CodeBlock and HTML.
assert_extract_messages(
r"```bob
// BOB
```

<details>

* Blah blah

</details>
",
&[
(1, "```bob\n// BOB\n```"), //
(7, "Blah blah"),
],
);
}
#[test]
fn extract_messages_backslashes() {
// Demonstrate how a single backslash in the Markdown becomes
// a backslash-escaped backslash when we extract the text.
// This is consistent with the CommonMark spec:
// https://spec.commonmark.org/0.30/#backslash-escapes.
// However, it causes problems for LaTeX preprocessors:
// https://github.com/google/mdbook-i18n-helpers/issues/105.
assert_extract_messages(
r"
$$
\sum_{n=1}^{\infty} 2^{-n} = 1
$$
",
&[(2, r"$$ \\sum\_{n=1}^{\infty} 2^{-n} = 1 $$")],
);
}
#[test]
fn extract_messages_skip_simple() {
assert_extract_messages(
r"<!-- mdbook-xgettext:skip -->
This is a paragraph.",
&[],
);
}
#[test]
fn extract_messages_skip_next_paragraph_ok() {
assert_extract_messages(
r"<!-- mdbook-xgettext:skip -->
This is a paragraph.

This should be translated.
",
&[(4, "This should be translated.")],
);
}
#[test]
fn extract_messages_skip_next_codeblock() {
assert_extract_messages(
r"<!-- mdbook-xgettext:skip -->
```
def f(x): return x * x
```
This should be translated.
",
&[(5, "This should be translated.")],
);
}
#[test]
fn extract_messages_skip_back_to_back() {
assert_extract_messages(
r"<!-- mdbook-xgettext:skip -->
```
def f(x): return x * x
```
<!-- mdbook-xgettext:skip -->
This should not be translated.

But *this* should!
",
&[(8, "But _this_ should!")],
);
}
#[test]
fn extract_messages_block_html_skip() {
// The comment is a block level html tag.
assert_extract_messages(
"<!-- mdbook-xgettext:skip -->\n\
This is ignored\n\
\n\
but this is not",
&[(4, "but this is not")],
);
}
#[test]
fn extract_messages_inline_html_skips() {
// The comment is an inline html tag.
assert_extract_messages(
"
this should be translated <!-- mdbook-xgettext:skip --> but not this.
... nor this.

But *this* should!",
&[(2, "this should be translated "), (5, "But _this_ should!")],
);
}
#[test]
fn extract_messages_skipping_second_item() {
assert_extract_messages(
"
* A
<!-- mdbook-xgettext:skip -->
* B
* C
",
&[(2, "A"), (5, "C")],
);
}
#[test]
fn extract_messages_skipping_second_paragraphed_item() {
assert_extract_messages(
"
* A

<!-- mdbook-xgettext:skip -->
* B

* C
",
&[(2, "A"), (7, "C")],
);
}
#[test]
fn extract_messages_skipping_inline_second_item() {
        // This isn't great: we lose text following an HTML comment.
        // Very similar to the failure mode of the
        // `extract_messages_details` test.
        //
        // The root cause is the Markdown spec and how the Markdown
        // parser treats HTML blocks: text that immediately follows an
        // HTML block on the same line is included as part of the HTML
        // block.
assert_extract_messages(
"
* A
* <!-- mdbook-xgettext:skip --> B
* C
",
&[(2, "A")],
);
}
#[test]
fn extract_messages_inline_skip_to_end_of_block() {
assert_extract_messages(
"foo <!-- mdbook-xgettext:skip --> **bold** bar
still skipped

not-skipped",
&[(1, "foo "), (4, "not-skipped")],
);
}
#[test]
fn extract_messages_automatic_skipping_nontranslatable_codeblocks_simple() {
assert_extract_messages(
r"
```python
def g(x):
this_should_be_skipped_no_strings_or_comments()
```
",
&[],
);
}
#[test]
fn extract_messages_automatic_skipping_nontranslatable_codeblocks() {
assert_extract_messages(
r#"
```python
def f(x):
print("this should be translated")
```
```python
def g(x):
but_this_should_not()
```
"#,
&[(4, "\"this should be translated\"")],
);
}
#[test]
fn extract_messages_without_language_specifier() {
assert_extract_messages(
r#"
```
def f(x):
print("this should be translated")
```
```
def g(x):
but_this_should_not()
```
"#,
&[(
2,
"```\ndef f(x):\n print(\"this should be translated\")\n```",
)],
);
}
#[test]
fn extract_messages_codeblock_in_codeblock() {
assert_extract_messages(
r#"
````
```
// codeblock in codeblock
```
````
"#,
&[(2, "````\n```\n// codeblock in codeblock\n```\n````")],
);
}
#[test]
fn extract_message_comments() {
assert_eq!(
extract_messages(
"
<!-- mdbook-xgettext:comment: first comment! -->
Hello world!
"
),
vec![(
3,
ExtractedMessage {
message: "Hello world!".into(),
comment: "first comment!".into(),
}
)]
);
}
#[test]
fn extract_message_comments_multiple_joined() {
assert_eq!(
extract_messages(
"
<!-- mdbook-xgettext:comment: this is a test -->
<!-- mdbook-xgettext:comment: of a comment that spans. -->
Greetings!
"
),
vec![(
4,
ExtractedMessage {
message: "Greetings!".into(),
comment: "this is a test of a comment that spans.".into(),
}
)]
);
}
#[test]
fn extract_message_multiple_comments() {
assert_eq!(
extract_messages(
"
before-no-comment

<!-- mdbook-xgettext:comment: another -->
Hello again, this is some text
with a comment on it.

<!-- mdbook-xgettext:comment: one more comment. -->
after

after-no-comment
"
),
vec![
(
2,
ExtractedMessage {
message: "before-no-comment".into(),
comment: "".into(),
}
),
(
5,
ExtractedMessage {
message: "Hello again, this is some text with a comment on it.".into(),
comment: "another".into(),
}
),
(
9,
ExtractedMessage {
message: "after".into(),
comment: "one more comment.".into(),
}
),
(
11,
ExtractedMessage {
message: "after-no-comment".into(),
comment: "".into(),
}
),
]
);
}
#[test]
fn extract_message_comments_on_codeblock() {
assert_eq!(
extract_messages(
r#"
<!-- mdbook-xgettext:comment: greetings! -->
```python
print("Hello world")
```
"#
),
vec![(
4,
ExtractedMessage {
message: "\"Hello world\"".into(),
comment: "greetings!".into(),
}
),]
);
}
}