Respect original transcript spacing when splitting by sentences
Starting in V2, we split transcripts into individual sentences. Currently,
this is done by building sentences up from transcript hypothesis parts.
However, it turns out that the speech model may sometimes return a mix of
character types, so we should rely on the whitespace given by the full text
of the transcript rather than on a single type of spacing per language.
Additionally, the speech models sometimes return a delimiter character
inside hypothesis parts, so we must remove it before using any hypothesis
parts.

Bug: b/330271007
Change-Id: Ib7dda966140fdd824a4a80a454e3fcc62c06d01b
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/5402048
Reviewed-by: Ahmed Nasr <anasr@google.com>
Reviewed-by: Li Lin <llin@chromium.org>
Commit-Queue: Benjamin Zielinski <bzielinski@google.com>
Cr-Commit-Position: refs/heads/main@{#1280884}
Committed by: Chromium LUCI CQ
Parent: 26ecb67fad
Commit: 64cb892523
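
Before the diff, a minimal standalone sketch of the approach this CL takes:
locate each sentence's final word in the paragraph's full text and slice the
sentence out of it, after stripping the U+2581 ("▁") delimiter. This is an
illustration only, not the CL's code; it uses plain std:: string handling in
place of the base:: helpers (base::RemoveChars, base::TrimString), and the
transcript and word lists are made up.

#include <iostream>
#include <string>
#include <vector>

// Strips the U+2581 ("▁") delimiter that the speech service sometimes
// prepends to hypothesis-part words. "\xE2\x96\x81" is UTF-8 for U+2581.
std::string RemoveDelimiter(std::string word) {
  const std::string kDelimiter = "\xE2\x96\x81";
  for (size_t pos = word.find(kDelimiter); pos != std::string::npos;
       pos = word.find(kDelimiter)) {
    word.erase(pos, kDelimiter.size());
  }
  return word;
}

int main() {
  // The full transcript text; its spacing is the source of truth.
  const std::string full_text = "First sentence. Second one.";
  // Last word of each sentence, as it might arrive in hypothesis parts.
  const std::vector<std::string> sentence_end_words = {
      "\xE2\x96\x81sentence.", "\xE2\x96\x81one."};

  size_t previous_end = 0;
  for (const std::string& raw_word : sentence_end_words) {
    const std::string end_word = RemoveDelimiter(raw_word);
    // Find where this sentence ends in the full text, then slice from the
    // previous sentence's end so the original spacing is preserved.
    // (Production code would also handle find() returning npos.)
    const size_t end =
        full_text.find(end_word, previous_end) + end_word.size();
    std::string sentence = full_text.substr(previous_end, end - previous_end);
    // Trim the leading space left over at the slice boundary.
    if (!sentence.empty() && sentence.front() == ' ') sentence.erase(0, 1);
    std::cout << sentence << "\n";  // "First sentence." then "Second one."
    previous_end = end;
  }
}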
@@ -32,16 +32,6 @@ constexpr std::string_view kRecognitionStatus = "recognitionStatus";
 constexpr std::string_view kMetadataVersionNumber = "version";
 constexpr std::string_view kGroupIdKey = "groupId";
 
-constexpr auto kLanguagesWithoutWhiteSpaces =
-    base::MakeFixedFlatSet<std::string_view>({
-        "ja",     // Japanese
-        "ko_KR",  // Korean
-        "th",     // Thai
-        "zh",     // Chinese
-        "zh_CN",  // Chinese Simplified
-        "zh_TW",  // Chinese Traditional
-    });
-
 // Source of common English abbreviations: icu's sentence break exception list
 // https://source.chromium.org/chromium/chromium/src/+/main:third_party/icu/source/data/brkitr/en.txt.
 constexpr auto kEnglishAbbreviationsInLowerCase =
@@ -82,19 +72,6 @@ base::Value::Dict HypothesisPartsToDict(
   return hypothesis_part_dict;
 }
 
-std::string GetSentenceText(const std::vector<media::HypothesisParts>& sentence,
-                            const std::string& caption_language) {
-  std::vector<std::string_view> sentence_text;
-  for (const auto& hypothesisPart : sentence) {
-    sentence_text.push_back(hypothesisPart.text[0]);
-  }
-  return base::JoinString(
-      sentence_text,
-      /*separator=*/kLanguagesWithoutWhiteSpaces.contains(caption_language)
-          ? ""
-          : " ");
-}
-
 std::vector<media::HypothesisParts> recalculateHypothesisPartTimeStamps(
     std::vector<media::HypothesisParts> sentence) {
   if (sentence.empty()) {
@@ -177,6 +154,9 @@ std::vector<std::unique_ptr<ProjectorTranscript>> SplitTranscriptIntoSentences(
           caption_language);
   base::TimeDelta sentence_start_time = paragraph_start_time;
   base::TimeDelta sentence_end_time;
+  const std::u16string full_text =
+      base::UTF8ToUTF16(paragraph_transcript->text());
+  size_t previous_sentence_end_pos = 0;
   for (uint i = 0; i < sentence_hypothesis_parts.size(); ++i) {
     std::vector<media::HypothesisParts> current_sentence_hypothesis_parts =
         recalculateHypothesisPartTimeStamps(
@@ -190,12 +170,26 @@ std::vector<std::unique_ptr<ProjectorTranscript>> SplitTranscriptIntoSentences(
             ? sentence_hypothesis_parts[i + 1][0].hypothesis_part_offset +
                   paragraph_start_time
             : paragraph_end_time;
-    const std::string sentence_text =
-        GetSentenceText(current_sentence_hypothesis_parts, caption_language);
+    std::u16string sentence_text = u"";
+    if (current_sentence_hypothesis_parts.size() > 0) {
+      std::u16string sentence_end_word =
+          base::UTF8ToUTF16(current_sentence_hypothesis_parts.back().text[0]);
+
+      // Remove the delimiter character sometimes added by the speech service.
+      base::RemoveChars(sentence_end_word, u"\u2581", &sentence_end_word);
+      const size_t current_sentence_end_pos =
+          full_text.find(sentence_end_word, previous_sentence_end_pos) +
+          sentence_end_word.length();
+      sentence_text = full_text.substr(
+          previous_sentence_end_pos,
+          (current_sentence_end_pos - previous_sentence_end_pos));
+      base::TrimString(sentence_text, u" ", &sentence_text);
+      previous_sentence_end_pos = current_sentence_end_pos;
+    }
     sentence_transcripts.push_back(std::make_unique<ProjectorTranscript>(
         sentence_start_time, sentence_end_time,
-        /*group_id=*/paragraph_start_time.InMilliseconds(), sentence_text,
-        current_sentence_hypothesis_parts));
+        /*group_id=*/paragraph_start_time.InMilliseconds(),
+        base::UTF16ToUTF8(sentence_text), current_sentence_hypothesis_parts));
     // Next sentence's start timestamp is current sentence's end timestamp.
     sentence_start_time = sentence_end_time;
   }
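
To see why the hunk above stops joining hypothesis parts by language, here
is a hedged illustration of the failure mode: for a Japanese transcript that
embeds Latin text such as "3 km", joining parts with the per-language
separator (empty for "ja") drops the original spaces, while slicing the
paragraph's full text keeps them. The strings mirror the Japanese test data
added later in this CL; the join loop is a simplified stand-in for the
removed GetSentenceText(), not actual Chromium code.

#include <iostream>
#include <string>
#include <vector>

int main() {
  // Hypothesis-part words for the start of the sentence.
  const std::vector<std::string> parts = {"こ", "れ", "は、",
                                          "3",  "km", "などの"};
  std::string joined;
  for (const std::string& part : parts) {
    joined += part;  // "" separator, since "ja" has no word spacing.
  }
  // Joining yields "これは、3kmなどの": the space inside "3 km" is lost.
  std::cout << joined << "\n";

  // The paragraph's full text still carries that space, which is why the CL
  // slices sentences out of full_text instead of re-joining parts.
  const std::string full_text =
      "これは、3 km などの数字を含むランダムなテキストです。";
  std::cout << full_text.substr(0, full_text.find("などの")) << "\n";
}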
@@ -48,6 +48,8 @@ class MetadataItem {
 
   base::TimeDelta& end_time() { return end_time_; }
 
+  std::string& text() { return text_; }
+
   // Return the serialized metadata item. This is used for storage.
   virtual base::Value::Dict ToJson() = 0;
 
@@ -154,6 +154,61 @@ constexpr char kCompleteMetadataV2Template[] = R"({
   "tableOfContent": []
 })";
 
+constexpr char kCompleteMetadataV2WithDelimiterTemplate[] = R"({
+  "captions": [
+    {
+      "endOffset": 3000,
+      "hypothesisParts": [
+        {
+          "offset": 0,
+          "text": [
+            "▁transcript"
+          ]
+        },
+        {
+          "offset": 2000,
+          "text": [
+            "▁text"
+          ]
+        }
+      ],
+      "startOffset": 1000,
+      "groupId": 1000,
+      "text": "transcript text"
+    },
+    {
+      "endOffset": 5000,
+      "hypothesisParts": [
+        {
+          "offset": 0,
+          "text": [
+            "▁transcript"
+          ]
+        },
+        {
+          "offset": 1000,
+          "text": [
+            "▁text"
+          ]
+        },
+        {
+          "offset": 1500,
+          "text": [
+            "▁2"
+          ]
+        }
+      ],
+      "startOffset": 3000,
+      "groupId": 3000,
+      "text": "transcript text 2"
+    }
+  ],
+  "captionLanguage": "en",
+  "recognitionStatus": 1,
+  "version": 2,
+  "tableOfContent": []
+})";
+
 constexpr char kCompleteMetadataV2MultipleSentenceTemplate[] = R"({
   "captions": [
     {
@@ -2077,6 +2132,121 @@ constexpr char kCompleteMetadataV2ChineseTemplate[] = R"({
   "version": 2
 })";
 
+constexpr char kCompleteMetadataV2JapaneseWithLatinCharactersTemplate[] = R"({
+  "captions": [
+    {
+      "endOffset": 11000,
+      "hypothesisParts": [
+        {
+          "offset": 0,
+          "text": [
+            "こ"
+          ]
+        },
+        {
+          "offset": 1000,
+          "text": [
+            "れ"
+          ]
+        },
+        {
+          "offset": 2000,
+          "text": [
+            "は、"
+          ]
+        },
+        {
+          "offset": 3000,
+          "text": [
+            "3"
+          ]
+        },
+        {
+          "offset": 4000,
+          "text": [
+            "km"
+          ]
+        },
+        {
+          "offset": 5000,
+          "text": [
+            "などの"
+          ]
+        },
+        {
+          "offset": 6000,
+          "text": [
+            "数字を"
+          ]
+        },
+        {
+          "offset": 7000,
+          "text": [
+            "含むラ"
+          ]
+        },
+        {
+          "offset": 8000,
+          "text": [
+            "ンダム"
+          ]
+        },
+        {
+          "offset": 9000,
+          "text": [
+            "なテキ"
+          ]
+        },
+        {
+          "offset": 10000,
+          "text": [
+            "ストです。"
+          ]
+        }
+      ],
+      "startOffset": 0,
+      "groupId": 0,
+      "text": "これは、3 km などの数字を含むランダムなテキストです。"
+    },
+    {
+      "endOffset": 15000,
+      "hypothesisParts": [
+        {
+          "offset": 0,
+          "text": [
+            "これ"
+          ]
+        },
+        {
+          "offset": 1000,
+          "text": [
+            "も分"
+          ]
+        },
+        {
+          "offset": 2000,
+          "text": [
+            "割した"
+          ]
+        },
+        {
+          "offset": 3000,
+          "text": [
+            "い文です。"
+          ]
+        }
+      ],
+      "startOffset": 11000,
+      "groupId": 0,
+      "text": "これも分割したい文です。"
+    }
+  ],
+  "captionLanguage": "ja",
+  "recognitionStatus": 1,
+  "version": 2,
+  "tableOfContent": []
+})";
+
 void AssertSerializedString(const std::string& expected,
                             const std::string& actual) {
   std::optional<base::Value> expected_value = base::JSONReader::Read(expected);
@@ -2134,7 +2304,8 @@ std::string BuildTranscriptJson(
       BuildHypothesisPartsList(hypothesis_part).c_str());
 }
 
-std::unique_ptr<ProjectorMetadata> populateMetadata() {
+std::unique_ptr<ProjectorMetadata> populateMetadata(
+    bool with_delimiters = false) {
  base::i18n::SetICUDefaultLocale("en_US");
   std::unique_ptr<ProjectorMetadata> metadata =
       std::make_unique<ProjectorMetadata>();
@@ -2142,10 +2313,13 @@ std::unique_ptr<ProjectorMetadata> populateMetadata() {
   metadata->SetMetadataVersionNumber(MetadataVersionNumber::kV2);
 
   std::vector<media::HypothesisParts> first_transcript;
-  first_transcript.emplace_back(std::vector<std::string>({"transcript"}),
-                                base::Milliseconds(0));
-  first_transcript.emplace_back(std::vector<std::string>({"text"}),
-                                base::Milliseconds(2000));
+  first_transcript.emplace_back(
+      std::vector<std::string>(
+          {with_delimiters ? "▁transcript" : "transcript"}),
+      base::Milliseconds(0));
+  first_transcript.emplace_back(
+      std::vector<std::string>({with_delimiters ? "▁text" : "text"}),
+      base::Milliseconds(2000));
 
   metadata->AddTranscript(std::make_unique<ProjectorTranscript>(
       /*start_time=*/base::Milliseconds(1000),
@@ -2155,12 +2329,16 @@ std::unique_ptr<ProjectorMetadata> populateMetadata() {
   metadata->MarkKeyIdea();
 
   std::vector<media::HypothesisParts> second_transcript;
-  second_transcript.emplace_back(std::vector<std::string>({"transcript"}),
-                                 base::Milliseconds(0));
-  second_transcript.emplace_back(std::vector<std::string>({"text"}),
-                                 base::Milliseconds(1000));
-  second_transcript.emplace_back(std::vector<std::string>({"2"}),
-                                 base::Milliseconds(1500));
+  second_transcript.emplace_back(
+      std::vector<std::string>(
+          {with_delimiters ? "▁transcript" : "transcript"}),
+      base::Milliseconds(0));
+  second_transcript.emplace_back(
+      std::vector<std::string>({with_delimiters ? "▁text" : "text"}),
+      base::Milliseconds(1000));
+  second_transcript.emplace_back(
+      std::vector<std::string>({with_delimiters ? "▁2" : "2"}),
+      base::Milliseconds(1500));
 
   metadata->AddTranscript(std::make_unique<ProjectorTranscript>(
       /*start_time=*/base::Milliseconds(3000),
@@ -2299,6 +2477,39 @@ std::unique_ptr<ProjectorMetadata> populateMetadataWithLanguageWithoutSpaces() {
   return metadata;
 }
 
+std::unique_ptr<ProjectorMetadata> populateMetadataWithMixedCharacters() {
+  base::i18n::SetICUDefaultLocale("ja");
+  std::unique_ptr<ProjectorMetadata> metadata =
+      std::make_unique<ProjectorMetadata>();
+  metadata->SetCaptionLanguage("ja");
+  metadata->SetMetadataVersionNumber(MetadataVersionNumber::kV2);
+
+  std::string paragraph_text =
+      "これは、3 km "
+      "などの数字を含むランダムなテキストです。これも分割したい文です。";
+  const std::vector<std::string> paragraph_words = {
+      "こ",         "れ",   "は、", "3",      "km",
+      "などの",     "数字を", "含むラ", "ンダム", "なテキ",
+      "ストです。", "これ", "も分", "割した", "い文です。",
+  };
+
+  std::vector<media::HypothesisParts> paragraph_hypothesis_parts;
+  for (uint i = 0; i < paragraph_words.size(); i++) {
+    paragraph_hypothesis_parts.emplace_back(
+        std::vector<std::string>({paragraph_words[i]}),
+        base::Milliseconds(i * 1000));
+  }
+  const base::TimeDelta paragraph_start_offset = base::Milliseconds(0);
+  const base::TimeDelta paragraph_end_offset =
+      base::Milliseconds(paragraph_words.size() * 1000);
+
+  metadata->AddTranscript(std::make_unique<ProjectorTranscript>(
+      paragraph_start_offset, paragraph_end_offset,
+      paragraph_start_offset.InMilliseconds(), paragraph_text,
+      paragraph_hypothesis_parts));
+  return metadata;
+}
+
 }  // namespace
 
 class ProjectorKeyIdeaTest : public testing::Test {
@@ -2476,4 +2687,23 @@ TEST_F(ProjectorMetadataTestV2, AddMultiSentenceTranscriptWithChinese) {
                          metadata->Serialize());
 }
 
+TEST_F(ProjectorMetadataTestV2, RemoveDelimiter) {
+  std::unique_ptr<ProjectorMetadata> metadata = populateMetadata(true);
+  metadata->SetMetadataVersionNumber(MetadataVersionNumber::kV2);
+
+  metadata->SetSpeechRecognitionStatus(RecognitionStatus::kComplete);
+  AssertSerializedString(kCompleteMetadataV2WithDelimiterTemplate,
+                         metadata->Serialize());
+}
+
+TEST_F(ProjectorMetadataTestV2, PreserveSpacingForMixedCharacters) {
+  std::unique_ptr<ProjectorMetadata> metadata =
+      populateMetadataWithMixedCharacters();
+  metadata->SetMetadataVersionNumber(MetadataVersionNumber::kV2);
+
+  metadata->SetSpeechRecognitionStatus(RecognitionStatus::kComplete);
+  AssertSerializedString(kCompleteMetadataV2JapaneseWithLatinCharactersTemplate,
+                         metadata->Serialize());
+}
+
 }  // namespace ash