0

Finch experiment: auto-detect text encoding

Experiment b/518968 aims to measure the impact of turning
on auto-encoding detection on Chrome on Android by default.

- Adds methods that tell us if:
  1) auto-encoding detection was attempted, due to lacking
     encoding information from meta tag, header, BOM, etc.
  2) auto-encoding detection successfully detected an
     encoding that differs from the default one, and so
     correctly renders a page that would otherwise have
     shown garbled text.

- Selectively turns on text encoding auto-detection by
  default for experiment group.

- Uploads histogram data on the auto-detection logic
  triggering rate and encoding method detected by
  the logic.

The CL will be reverted once the experiment is finished.

BUG=518968

Review URL: https://codereview.chromium.org/1456843002

Cr-Commit-Position: refs/heads/master@{#363716}
This commit is contained in:
jinsukkim
2015-12-07 18:43:43 -08:00
committed by Commit bot
parent 2aaa965828
commit 35a1421d89
11 changed files with 214 additions and 8 deletions

@ -7,7 +7,9 @@
#include <set>
#include <string>
#include "base/command_line.h"
#include "base/memory/singleton.h"
#include "base/metrics/field_trial.h"
#include "base/prefs/overlay_user_pref_store.h"
#include "base/prefs/pref_change_registrar.h"
#include "base/prefs/pref_service.h"
@ -318,6 +320,14 @@ void RegisterLocalizedFontPref(user_prefs::PrefRegistrySyncable* registry,
registry->RegisterIntegerPref(path, val);
}
// Reports whether the "AutodetectEncoding" field trial places this client in
// a group whose name begins with "Enabled" (compared ASCII
// case-insensitively).
bool IsAutodetectEncodingEnabledByDefault() {
  const std::string trial_group =
      base::FieldTrialList::FindFullName("AutodetectEncoding");
  const bool in_enabled_group = base::StartsWith(
      trial_group, "Enabled", base::CompareCase::INSENSITIVE_ASCII);
  return in_enabled_group;
}
} // namespace
// Watching all these settings per tab is slow when a user has a lot of tabs and
@ -581,9 +591,11 @@ void PrefsTabHelper::RegisterProfilePrefs(
IDS_MINIMUM_FONT_SIZE);
RegisterLocalizedFontPref(registry, prefs::kWebKitMinimumLogicalFontSize,
IDS_MINIMUM_LOGICAL_FONT_SIZE);
bool uses_universal_detector = IsAutodetectEncodingEnabledByDefault() ||
l10n_util::GetStringUTF8(IDS_USES_UNIVERSAL_DETECTOR) == "true";
registry->RegisterBooleanPref(
prefs::kWebKitUsesUniversalDetector,
l10n_util::GetStringUTF8(IDS_USES_UNIVERSAL_DETECTOR) == "true",
uses_universal_detector,
user_prefs::PrefRegistrySyncable::SYNCABLE_PREF);
registry->RegisterStringPref(
prefs::kStaticEncodings,

@ -1556,6 +1556,16 @@ void Document::updateStyleInvalidationIfNeeded()
styleEngine().styleInvalidator().invalidate(*this);
}
// Reports whether content sniffing was attempted to determine this document's
// encoding. Pure delegation: the answer is whatever m_encodingData recorded.
bool Document::attemptedToDetermineEncodingFromContentSniffing() const
{
return m_encodingData.attemptedToDetermineEncodingFromContentSniffing();
}
// Reports whether this document's encoding was obtained from content
// sniffing. Pure delegation: the answer is whatever m_encodingData recorded.
bool Document::encodingWasDetectedFromContentSniffing() const
{
return m_encodingData.encodingWasDetectedFromContentSniffing();
}
void Document::setupFontBuilder(ComputedStyle& documentStyle)
{
FontBuilder fontBuilder(*this);

@ -1002,6 +1002,9 @@ public:
void updateStyleInvalidationIfNeeded();
bool attemptedToDetermineEncodingFromContentSniffing() const;
bool encodingWasDetectedFromContentSniffing() const;
DECLARE_VIRTUAL_TRACE();
bool hasSVGFilterElementsRequiringLayerUpdate() const { return m_layerUpdateSVGFilterElements.size(); }

@ -46,6 +46,8 @@ DocumentEncodingData::DocumentEncodingData(const TextResourceDecoder& decoder)
{
m_encoding = decoder.encoding();
m_wasDetectedHeuristically = decoder.encodingWasDetectedHeuristically();
m_attemptedToDetermineEncodingFromContentSniffing = decoder.attemptedToDetermineEncodingFromContentSniffing();
m_encodingWasDetectedFromContentSniffing = decoder.encodingWasDetectedFromContentSniffing();
m_sawDecodingError = decoder.sawError();
}

@ -48,10 +48,14 @@ public:
void setEncoding(const WTF::TextEncoding&);
// Flag copied from TextResourceDecoder::encodingWasDetectedHeuristically() when built from a decoder.
bool wasDetectedHeuristically() const { return m_wasDetectedHeuristically; }
// Flag copied from TextResourceDecoder::sawError() when built from a decoder.
bool sawDecodingError() const { return m_sawDecodingError; }
// Flag copied from TextResourceDecoder::attemptedToDetermineEncodingFromContentSniffing() when built from a decoder.
bool attemptedToDetermineEncodingFromContentSniffing() const { return m_attemptedToDetermineEncodingFromContentSniffing; }
// Flag copied from TextResourceDecoder::encodingWasDetectedFromContentSniffing() when built from a decoder.
bool encodingWasDetectedFromContentSniffing() const { return m_encodingWasDetectedFromContentSniffing; }
private:
WTF::TextEncoding m_encoding;
bool m_wasDetectedHeuristically;
bool m_attemptedToDetermineEncodingFromContentSniffing;
bool m_encodingWasDetectedFromContentSniffing;
bool m_sawDecodingError;
};
@ -62,6 +66,8 @@ inline bool operator!=(const DocumentEncodingData& a, const DocumentEncodingData
{
return a.encoding() != b.encoding()
|| a.wasDetectedHeuristically() != b.wasDetectedHeuristically()
|| a.attemptedToDetermineEncodingFromContentSniffing() != b.attemptedToDetermineEncodingFromContentSniffing()
|| a.encodingWasDetectedFromContentSniffing() != b.encodingWasDetectedFromContentSniffing()
|| a.sawDecodingError() != b.sawDecodingError();
}

@ -19,7 +19,6 @@
Boston, MA 02110-1301, USA.
*/
#include "config.h"
#include "core/html/parser/TextResourceDecoder.h"
@ -401,9 +400,7 @@ String TextResourceDecoder::decode(const char* data, size_t len)
checkForMetaCharset(dataForDecode, lengthForDecode);
if (shouldAutoDetect()) {
WTF::TextEncoding detectedEncoding;
if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))
setEncoding(detectedEncoding, EncodingFromContentSniffing);
detectTextEncoding(data, len);
}
ASSERT(m_encoding.isValid());
@ -417,6 +414,16 @@ String TextResourceDecoder::decode(const char* data, size_t len)
return result;
}
void TextResourceDecoder::detectTextEncoding(const char* data, size_t len)
{
WTF::TextEncoding detectedEncoding;
bool detected = blink::detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding);
if (detected && detectedEncoding != encoding())
setEncoding(detectedEncoding, EncodingFromContentSniffing);
else
setEncoding(detectedEncoding, DefaultEncodingAttemptedSniffing);
}
String TextResourceDecoder::flush()
{
// If we can not identify the encoding even after a document is completely
@ -424,9 +431,7 @@ String TextResourceDecoder::flush()
// autodetection is satisfied.
if (m_buffer.size() && shouldAutoDetect()
&& ((!m_checkedForXMLCharset && (m_contentType == HTMLContent || m_contentType == XMLContent)) || (!m_checkedForCSSCharset && (m_contentType == CSSContent)))) {
WTF::TextEncoding detectedEncoding;
if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding))
setEncoding(detectedEncoding, EncodingFromContentSniffing);
detectTextEncoding(m_buffer.data(), m_buffer.size());
}
if (!m_codec)

@ -37,6 +37,7 @@ class CORE_EXPORT TextResourceDecoder {
public:
enum EncodingSource {
DefaultEncoding,
DefaultEncodingAttemptedSniffing,
AutoDetectedEncoding,
EncodingFromContentSniffing,
EncodingFromXMLHeader,
@ -57,6 +58,18 @@ public:
// True when the current encoding source is one of the heuristic ones:
// AutoDetectedEncoding, EncodingFromContentSniffing, or
// DefaultEncodingAttemptedSniffing.
bool encodingWasDetectedHeuristically() const
{
    switch (m_source) {
    case AutoDetectedEncoding:
    case EncodingFromContentSniffing:
    case DefaultEncodingAttemptedSniffing:
        return true;
    default:
        return false;
    }
}
// True only when the current encoding source is EncodingFromContentSniffing.
bool encodingWasDetectedFromContentSniffing() const
{
return m_source == EncodingFromContentSniffing;
}
// True when the current encoding source shows that content sniffing ran:
// either DefaultEncodingAttemptedSniffing or EncodingFromContentSniffing.
bool attemptedToDetermineEncodingFromContentSniffing() const
{
    switch (m_source) {
    case DefaultEncodingAttemptedSniffing:
    case EncodingFromContentSniffing:
        return true;
    default:
        return false;
    }
}
@ -83,6 +96,7 @@ private:
bool checkForXMLCharset(const char*, size_t, bool& movedDataToBuffer);
void checkForMetaCharset(const char*, size_t);
bool shouldAutoDetect() const;
void detectTextEncoding(const char*, size_t);
ContentType m_contentType;
WTF::TextEncoding m_encoding;

@ -322,6 +322,18 @@ WebDistillabilityFeatures WebDocument::distillabilityFeatures()
return DocumentStatisticsCollector::collectStatistics(*unwrap<Document>());
}
bool WebDocument::attemptedToDetermineEncodingFromContentSniffing() const
{
const Document* document = constUnwrap<Document>();
return document->attemptedToDetermineEncodingFromContentSniffing();
}
bool WebDocument::encodingWasDetectedFromContentSniffing() const
{
const Document* document = constUnwrap<Document>();
return document->encodingWasDetectedFromContentSniffing();
}
WebDocument::WebDocument(const PassRefPtrWillBeRawPtr<Document>& elem)
: WebNode(elem)
{

@ -323,6 +323,69 @@ private:
WebColor m_color;
};
#if OS(ANDROID)
// Canonical encoding names, indexed by the value uploaded to UMA for the
// experiment on text encoding auto detection. Index 0 ("UNKNOWN") is the
// value encodingToUmaId() falls back to for a name that is not listed.
// The listed order must stay in sync with the enum definition
// 'EncodingMethod' in tools/metrics/histograms/histograms.xml.
// Both the strings and the table slots are const so the table cannot be
// modified at runtime.
static const char* const kEncodingNames[] = {
    "UNKNOWN",
    "Big5",
    "EUC-JP",
    "EUC-KR",
    "GBK",
    "IBM866",
    "ISO-2022-JP",
    "ISO-8859-10",
    "ISO-8859-13",
    "ISO-8859-14",
    "ISO-8859-15",
    "ISO-8859-16",
    "ISO-8859-2",
    "ISO-8859-3",
    "ISO-8859-4",
    "ISO-8859-5",
    "ISO-8859-6",
    "ISO-8859-7",
    "ISO-8859-8",
    "ISO-8859-8-I",
    "KOI8-R",
    "KOI8-U",
    "Shift_JIS",
    "UTF-16LE",
    "UTF-8",
    "gb18030",
    "macintosh",
    "windows-1250",
    "windows-1251",
    "windows-1252",
    "windows-1253",
    "windows-1254",
    "windows-1255",
    "windows-1256",
    "windows-1257",
    "windows-1258",
    "windows-874"
};
// Returns the index of the kEncodingNames entry whose name matches
// |encoding|'s name, compared case-insensitively. Returns 0 (the "UNKNOWN"
// slot) when the encoding has no name or the name is not listed.
static int encodingToUmaId(const WTF::TextEncoding& encoding)
{
    const char* encodingName = encoding.name();
    // Without a name there is nothing to match; this also keeps a null
    // pointer from ever reaching strcasecmp().
    if (!encodingName)
        return 0;
    for (size_t i = 0; i < WTF_ARRAY_LENGTH(kEncodingNames); ++i) {
        if (!strcasecmp(kEncodingNames[i], encodingName))
            return static_cast<int>(i); // Table is tiny; the index always fits in int.
    }
    return 0;
}
static bool isInternalURL(const KURL& url)
{
const String& protocol = url.protocol();
return protocol == "chrome" || protocol == "chrome-native" || protocol == "swappedout";
}
#endif
} // namespace
// WebView ----------------------------------------------------------------
@ -3957,6 +4020,18 @@ void WebViewImpl::didFinishDocumentLoad(WebLocalFrameImpl* webframe)
if (webframe != mainFrameImpl())
return;
resumeTreeViewCommitsIfRenderingReady();
#if OS(ANDROID)
if (!isInternalURL(webframe->frame()->document()->baseURL()) && page()->settings().usesEncodingDetector()) {
const Document& document = *webframe->frame()->document();
// "AutodetectEncoding.Attempted" is of boolean type - either 0 or 1. Use 2 for the boundary value.
Platform::current()->histogramEnumeration("AutodetectEncoding.Attempted", document.attemptedToDetermineEncodingFromContentSniffing(), 2);
if (document.encodingWasDetectedFromContentSniffing()) {
int encodingId = encodingToUmaId(document.encoding());
Platform::current()->histogramEnumeration("AutodetectEncoding.Detected", encodingId, WTF_ARRAY_LENGTH(kEncodingNames) + 1);
}
}
#endif
}
void WebViewImpl::didRemoveAllPendingStylesheet(WebLocalFrameImpl* webframe)

@ -137,6 +137,9 @@ public:
BLINK_EXPORT bool manifestUseCredentials() const;
BLINK_EXPORT WebDistillabilityFeatures distillabilityFeatures();
BLINK_EXPORT bool attemptedToDetermineEncodingFromContentSniffing() const;
BLINK_EXPORT bool encodingWasDetectedFromContentSniffing() const;
#if BLINK_IMPLEMENTATION
WebDocument(const PassRefPtrWillBeRawPtr<Document>&);
WebDocument& operator=(const PassRefPtrWillBeRawPtr<Document>&);

@ -2054,6 +2054,30 @@ http://cs/file:chrome/histograms.xml - but prefer this file for new entries.
</summary>
</histogram>
<histogram name="AutodetectEncoding.Attempted" enum="BooleanAttempted">
<owner>jinsukkim@chromium.org</owner>
<summary>
Whether the text encoding auto detection logic was attempted for a web page.
The logic is triggered when the parser fails to find the encoding method
from other signals such as http header, meta tag, BOM, etc.
If the logic successfully detects a new encoding method which is different
from the default one, the result is reported through
AutodetectEncoding.Detected with the encoding method (see below). Otherwise
- i.e. the detection logic fails for the page, or the detected encoding is
the same as the default - no result is reported.
</summary>
</histogram>
<histogram name="AutodetectEncoding.Detected" enum="EncodingMethod">
<owner>jinsukkim@chromium.org</owner>
<summary>
The number of web pages whose encoding method is found by the auto detection
logic. Grouped by the encoding methods defined in EncodingMethod.
</summary>
</histogram>
<histogram name="Autofill.AddressBook.AccessSkipped" enum="BooleanSkipped">
<obsolete>
Deprecated as of 8/2015.
@ -59528,6 +59552,46 @@ http://cs/file:chrome/histograms.xml - but prefer this file for new entries.
<int value="9" label="SCRIPT_READ_FINISHED"/>
</enum>
<!-- Values are indices into the kEncodingNames array that encodingToUmaId()
     searches; keep the order of the two lists in sync. -->
<enum name="EncodingMethod" type="int">
<int value="0" label="UNKNOWN"/>
<int value="1" label="Big5"/>
<int value="2" label="EUC-JP"/>
<int value="3" label="EUC-KR"/>
<int value="4" label="GBK"/>
<int value="5" label="IBM866"/>
<int value="6" label="ISO-2022-JP"/>
<int value="7" label="ISO-8859-10"/>
<int value="8" label="ISO-8859-13"/>
<int value="9" label="ISO-8859-14"/>
<int value="10" label="ISO-8859-15"/>
<int value="11" label="ISO-8859-16"/>
<int value="12" label="ISO-8859-2"/>
<int value="13" label="ISO-8859-3"/>
<int value="14" label="ISO-8859-4"/>
<int value="15" label="ISO-8859-5"/>
<int value="16" label="ISO-8859-6"/>
<int value="17" label="ISO-8859-7"/>
<int value="18" label="ISO-8859-8"/>
<int value="19" label="ISO-8859-8-I"/>
<int value="20" label="KOI8-R"/>
<int value="21" label="KOI8-U"/>
<int value="22" label="Shift_JIS"/>
<int value="23" label="UTF-16LE"/>
<int value="24" label="UTF-8"/>
<int value="25" label="gb18030"/>
<int value="26" label="macintosh"/>
<int value="27" label="windows-1250"/>
<int value="28" label="windows-1251"/>
<int value="29" label="windows-1252"/>
<int value="30" label="windows-1253"/>
<int value="31" label="windows-1254"/>
<int value="32" label="windows-1255"/>
<int value="33" label="windows-1256"/>
<int value="34" label="windows-1257"/>
<int value="35" label="windows-1258"/>
<int value="36" label="windows-874"/>
</enum>
<enum name="EnhancedBookmarkViewMode" type="int">
<obsolete>
Deprecated 9/2015.