
Respect original transcript spacing when splitting by sentences

Starting in V2, we split transcripts into individual sentences.
Currently, the sentence text is built by joining the transcript
hypothesis parts with a per-language separator. However, the speech
model can return a mix of character types (for example, Japanese text
containing Latin digits and units), so we should rely on the whitespace
in the full transcript text rather than on a single type of spacing per
language.
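
For illustration only, a minimal standalone sketch of the substring
approach (standard library only; the actual change operates on
std::u16string with base:: helpers, and ExtractSentenceFromFullText is
an illustrative name, not code from this CL):

// Minimal sketch, not the actual implementation: locate the sentence's
// last word in the full transcript text and take the substring since the
// previous sentence's end, so the original spacing is preserved.
#include <string>
#include <string_view>

std::string ExtractSentenceFromFullText(std::string_view full_text,
                                        std::string_view last_word,
                                        size_t* search_pos) {
  // Find where the sentence's final word ends, starting from where the
  // previous sentence left off.
  const size_t found = full_text.find(last_word, *search_pos);
  if (found == std::string_view::npos) {
    return std::string();
  }
  const size_t end_pos = found + last_word.size();
  // Everything between the previous sentence's end and this one's end is
  // the sentence text, with whatever whitespace the transcript had.
  std::string sentence(full_text.substr(*search_pos, end_pos - *search_pos));
  // Drop a leading separator space left over from the previous boundary.
  if (!sentence.empty() && sentence.front() == ' ') {
    sentence.erase(0, 1);
  }
  *search_pos = end_pos;
  return sentence;
}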

Additionally, the speech models sometimes return a delimiter character
(U+2581) inside hypothesis parts, so we must remove it before using any
hypothesis part text.
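
As a rough sketch of the delimiter handling (standard library only; the
real code calls base::RemoveChars with u"\u2581" on UTF-16 text, and
RemoveDelimiter is an illustrative name):

// Minimal sketch: strip the U+2581 delimiter (UTF-8 bytes E2 96 81) that
// the speech service sometimes prepends to hypothesis-part words.
#include <string>

std::string RemoveDelimiter(std::string word) {
  static constexpr char kDelimiter[] = "\xE2\x96\x81";  // "▁" (U+2581)
  size_t pos;
  while ((pos = word.find(kDelimiter)) != std::string::npos) {
    word.erase(pos, 3);
  }
  return word;
}

// e.g. RemoveDelimiter("▁transcript") returns "transcript".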

Bug: b/330271007
Change-Id: Ib7dda966140fdd824a4a80a454e3fcc62c06d01b
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/5402048
Reviewed-by: Ahmed Nasr <anasr@google.com>
Reviewed-by: Li Lin <llin@chromium.org>
Commit-Queue: Benjamin Zielinski <bzielinski@google.com>
Cr-Commit-Position: refs/heads/main@{#1280884}
Author: Benjamin Zielinski
Date: 2024-04-01 21:38:02 +00:00
Committed by: Chromium LUCI CQ
Parent: 26ecb67fad
Commit: 64cb892523
3 changed files with 264 additions and 38 deletions

@@ -32,16 +32,6 @@ constexpr std::string_view kRecognitionStatus = "recognitionStatus";
constexpr std::string_view kMetadataVersionNumber = "version";
constexpr std::string_view kGroupIdKey = "groupId";
constexpr auto kLanguagesWithoutWhiteSpaces =
base::MakeFixedFlatSet<std::string_view>({
"ja", // Japanese
"ko_KR", // Korean
"th", // Thai
"zh", // Chinese
"zh_CN", // Chinese Simplified
"zh_TW", // Chinese Traditional
});
// Source of common English abbreviations: icu's sentence break exception list
// https://source.chromium.org/chromium/chromium/src/+/main:third_party/icu/source/data/brkitr/en.txt.
constexpr auto kEnglishAbbreviationsInLowerCase =
@@ -82,19 +72,6 @@ base::Value::Dict HypothesisPartsToDict(
return hypothesis_part_dict;
}
std::string GetSentenceText(const std::vector<media::HypothesisParts>& sentence,
const std::string& caption_language) {
std::vector<std::string_view> sentence_text;
for (const auto& hypothesisPart : sentence) {
sentence_text.push_back(hypothesisPart.text[0]);
}
return base::JoinString(
sentence_text,
/*separator=*/kLanguagesWithoutWhiteSpaces.contains(caption_language)
? ""
: " ");
}
std::vector<media::HypothesisParts> recalculateHypothesisPartTimeStamps(
std::vector<media::HypothesisParts> sentence) {
if (sentence.empty()) {
@@ -177,6 +154,9 @@ std::vector<std::unique_ptr<ProjectorTranscript>> SplitTranscriptIntoSentences(
caption_language);
base::TimeDelta sentence_start_time = paragraph_start_time;
base::TimeDelta sentence_end_time;
const std::u16string full_text =
base::UTF8ToUTF16(paragraph_transcript->text());
size_t previous_sentence_end_pos = 0;
for (uint i = 0; i < sentence_hypothesis_parts.size(); ++i) {
std::vector<media::HypothesisParts> current_sentence_hypothesis_parts =
recalculateHypothesisPartTimeStamps(
@@ -190,12 +170,26 @@ std::vector<std::unique_ptr<ProjectorTranscript>> SplitTranscriptIntoSentences(
? sentence_hypothesis_parts[i + 1][0].hypothesis_part_offset +
paragraph_start_time
: paragraph_end_time;
const std::string sentence_text =
GetSentenceText(current_sentence_hypothesis_parts, caption_language);
std::u16string sentence_text = u"";
if (current_sentence_hypothesis_parts.size() > 0) {
std::u16string sentence_end_word =
base::UTF8ToUTF16(current_sentence_hypothesis_parts.back().text[0]);
// Remove the delimiter character sometimes added by the speech service.
base::RemoveChars(sentence_end_word, u"\u2581", &sentence_end_word);
const size_t current_sentence_end_pos =
full_text.find(sentence_end_word, previous_sentence_end_pos) +
sentence_end_word.length();
sentence_text = full_text.substr(
previous_sentence_end_pos,
(current_sentence_end_pos - previous_sentence_end_pos));
base::TrimString(sentence_text, u" ", &sentence_text);
previous_sentence_end_pos = current_sentence_end_pos;
}
sentence_transcripts.push_back(std::make_unique<ProjectorTranscript>(
sentence_start_time, sentence_end_time,
/*group_id=*/paragraph_start_time.InMilliseconds(), sentence_text,
current_sentence_hypothesis_parts));
/*group_id=*/paragraph_start_time.InMilliseconds(),
base::UTF16ToUTF8(sentence_text), current_sentence_hypothesis_parts));
// Next sentence's start timestamp is current sentence's end timestamp.
sentence_start_time = sentence_end_time;
}

@@ -48,6 +48,8 @@ class MetadataItem {
base::TimeDelta& end_time() { return end_time_; }
std::string& text() { return text_; }
// Return the serialized metadata item. This is used for storage.
virtual base::Value::Dict ToJson() = 0;

@@ -154,6 +154,61 @@ constexpr char kCompleteMetadataV2Template[] = R"({
"tableOfContent": []
})";
constexpr char kCompleteMetadataV2WithDelimiterTemplate[] = R"({
"captions": [
{
"endOffset": 3000,
"hypothesisParts": [
{
"offset": 0,
"text": [
"▁transcript"
]
},
{
"offset": 2000,
"text": [
"▁text"
]
}
],
"startOffset": 1000,
"groupId": 1000,
"text": "transcript text"
},
{
"endOffset": 5000,
"hypothesisParts": [
{
"offset": 0,
"text": [
"▁transcript"
]
},
{
"offset": 1000,
"text": [
"▁text"
]
},
{
"offset": 1500,
"text": [
"▁2"
]
}
],
"startOffset": 3000,
"groupId": 3000,
"text": "transcript text 2"
}
],
"captionLanguage": "en",
"recognitionStatus": 1,
"version": 2,
"tableOfContent": []
})";
constexpr char kCompleteMetadataV2MultipleSentenceTemplate[] = R"({
"captions": [
{
@@ -2077,6 +2132,121 @@ constexpr char kCompleteMetadataV2ChineseTemplate[] = R"({
"version": 2
})";
constexpr char kCompleteMetadataV2JapaneseWithLatinCharactersTemplate[] = R"({
"captions": [
{
"endOffset": 11000,
"hypothesisParts": [
{
"offset": 0,
"text": [
""
]
},
{
"offset": 1000,
"text": [
""
]
},
{
"offset": 2000,
"text": [
"は、"
]
},
{
"offset": 3000,
"text": [
"3"
]
},
{
"offset": 4000,
"text": [
"km"
]
},
{
"offset": 5000,
"text": [
"などの"
]
},
{
"offset": 6000,
"text": [
"数字を"
]
},
{
"offset": 7000,
"text": [
"含むラ"
]
},
{
"offset": 8000,
"text": [
"ンダム"
]
},
{
"offset": 9000,
"text": [
"なテキ"
]
},
{
"offset": 10000,
"text": [
"ストです。"
]
}
],
"startOffset": 0,
"groupId": 0,
"text": "これは、3 km などの数字を含むランダムなテキストです。"
},
{
"endOffset": 15000,
"hypothesisParts": [
{
"offset": 0,
"text": [
"これ"
]
},
{
"offset": 1000,
"text": [
"も分"
]
},
{
"offset": 2000,
"text": [
"割した"
]
},
{
"offset": 3000,
"text": [
"い文です。"
]
}
],
"startOffset": 11000,
"groupId": 0,
"text": "これも分割したい文です。"
}
],
"captionLanguage": "ja",
"recognitionStatus": 1,
"version": 2,
"tableOfContent": []
})";
void AssertSerializedString(const std::string& expected,
const std::string& actual) {
std::optional<base::Value> expected_value = base::JSONReader::Read(expected);
@@ -2134,7 +2304,8 @@ std::string BuildTranscriptJson(
BuildHypothesisPartsList(hypothesis_part).c_str());
}
std::unique_ptr<ProjectorMetadata> populateMetadata() {
std::unique_ptr<ProjectorMetadata> populateMetadata(
bool with_delimiters = false) {
base::i18n::SetICUDefaultLocale("en_US");
std::unique_ptr<ProjectorMetadata> metadata =
std::make_unique<ProjectorMetadata>();
@@ -2142,10 +2313,13 @@ std::unique_ptr<ProjectorMetadata> populateMetadata() {
metadata->SetMetadataVersionNumber(MetadataVersionNumber::kV2);
std::vector<media::HypothesisParts> first_transcript;
first_transcript.emplace_back(std::vector<std::string>({"transcript"}),
base::Milliseconds(0));
first_transcript.emplace_back(std::vector<std::string>({"text"}),
base::Milliseconds(2000));
first_transcript.emplace_back(
std::vector<std::string>(
{with_delimiters ? "▁transcript" : "transcript"}),
base::Milliseconds(0));
first_transcript.emplace_back(
std::vector<std::string>({with_delimiters ? "▁text" : "text"}),
base::Milliseconds(2000));
metadata->AddTranscript(std::make_unique<ProjectorTranscript>(
/*start_time=*/base::Milliseconds(1000),
@@ -2155,12 +2329,16 @@ std::unique_ptr<ProjectorMetadata> populateMetadata() {
metadata->MarkKeyIdea();
std::vector<media::HypothesisParts> second_transcript;
second_transcript.emplace_back(std::vector<std::string>({"transcript"}),
base::Milliseconds(0));
second_transcript.emplace_back(std::vector<std::string>({"text"}),
base::Milliseconds(1000));
second_transcript.emplace_back(std::vector<std::string>({"2"}),
base::Milliseconds(1500));
second_transcript.emplace_back(
std::vector<std::string>(
{with_delimiters ? "▁transcript" : "transcript"}),
base::Milliseconds(0));
second_transcript.emplace_back(
std::vector<std::string>({with_delimiters ? "▁text" : "text"}),
base::Milliseconds(1000));
second_transcript.emplace_back(
std::vector<std::string>({with_delimiters ? "▁2" : "2"}),
base::Milliseconds(1500));
metadata->AddTranscript(std::make_unique<ProjectorTranscript>(
/*start_time=*/base::Milliseconds(3000),
@@ -2299,6 +2477,39 @@ std::unique_ptr<ProjectorMetadata> populateMetadataWithLanguageWithoutSpaces() {
return metadata;
}
std::unique_ptr<ProjectorMetadata> populateMetadataWithMixedCharacters() {
base::i18n::SetICUDefaultLocale("ja");
std::unique_ptr<ProjectorMetadata> metadata =
std::make_unique<ProjectorMetadata>();
metadata->SetCaptionLanguage("ja");
metadata->SetMetadataVersionNumber(MetadataVersionNumber::kV2);
std::string paragraph_text =
"これは、3 km "
"などの数字を含むランダムなテキストです。これも分割したい文です。";
const std::vector<std::string> paragraph_words = {
"", "", "は、", "3", "km",
"などの", "数字を", "含むラ", "ンダム", "なテキ",
"ストです。", "これ", "も分", "割した", "い文です。",
};
std::vector<media::HypothesisParts> paragraph_hypothesis_parts;
for (uint i = 0; i < paragraph_words.size(); i++) {
paragraph_hypothesis_parts.emplace_back(
std::vector<std::string>({paragraph_words[i]}),
base::Milliseconds(i * 1000));
}
const base::TimeDelta paragraph_start_offset = base::Milliseconds(0);
const base::TimeDelta paragraph_end_offset =
base::Milliseconds(paragraph_words.size() * 1000);
metadata->AddTranscript(std::make_unique<ProjectorTranscript>(
paragraph_start_offset, paragraph_end_offset,
paragraph_start_offset.InMilliseconds(), paragraph_text,
paragraph_hypothesis_parts));
return metadata;
}
} // namespace
class ProjectorKeyIdeaTest : public testing::Test {
@@ -2476,4 +2687,23 @@ TEST_F(ProjectorMetadataTestV2, AddMultiSentenceTranscriptWithChinese) {
metadata->Serialize());
}
TEST_F(ProjectorMetadataTestV2, RemoveDelimiter) {
std::unique_ptr<ProjectorMetadata> metadata = populateMetadata(true);
metadata->SetMetadataVersionNumber(MetadataVersionNumber::kV2);
metadata->SetSpeechRecognitionStatus(RecognitionStatus::kComplete);
AssertSerializedString(kCompleteMetadataV2WithDelimiterTemplate,
metadata->Serialize());
}
TEST_F(ProjectorMetadataTestV2, PreserveSpacingForMixedCharacters) {
std::unique_ptr<ProjectorMetadata> metadata =
populateMetadataWithMixedCharacters();
metadata->SetMetadataVersionNumber(MetadataVersionNumber::kV2);
metadata->SetSpeechRecognitionStatus(RecognitionStatus::kComplete);
AssertSerializedString(kCompleteMetadataV2JapaneseWithLatinCharactersTemplate,
metadata->Serialize());
}
} // namespace ash