Finch experiment: auto-detect text encoding
Experiment b/518968 aims to measure the impact of turning on text encoding auto-detection by default in Chrome on Android. - Adds methods that tell us whether: 1) encoding auto-detection was attempted because no encoding information was available from a meta tag, HTTP header, BOM, etc. 2) encoding auto-detection found an encoding different from the default one, so that a page which would otherwise have shown garbled text is displayed correctly. - Selectively turns on text encoding auto-detection by default for the experiment group. - Uploads histogram data on how often the auto-detection logic is triggered and on the encoding methods it detects. The CL will be reverted once the experiment is finished. BUG=518968 Review URL: https://codereview.chromium.org/1456843002 Cr-Commit-Position: refs/heads/master@{#363716}
This commit is contained in:
chrome/browser/ui/prefs
third_party/WebKit
Source
core
dom
html
web
public
tools/metrics/histograms
@ -7,7 +7,9 @@
|
||||
#include <set>
|
||||
#include <string>
|
||||
|
||||
#include "base/command_line.h"
|
||||
#include "base/memory/singleton.h"
|
||||
#include "base/metrics/field_trial.h"
|
||||
#include "base/prefs/overlay_user_pref_store.h"
|
||||
#include "base/prefs/pref_change_registrar.h"
|
||||
#include "base/prefs/pref_service.h"
|
||||
@ -318,6 +320,14 @@ void RegisterLocalizedFontPref(user_prefs::PrefRegistrySyncable* registry,
|
||||
registry->RegisterIntegerPref(path, val);
|
||||
}
|
||||
|
||||
bool IsAutodetectEncodingEnabledByDefault() {
|
||||
const std::string group_name = base::FieldTrialList::FindFullName(
|
||||
"AutodetectEncoding");
|
||||
return base::StartsWith(group_name,
|
||||
"Enabled",
|
||||
base::CompareCase::INSENSITIVE_ASCII);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Watching all these settings per tab is slow when a user has a lot of tabs and
|
||||
@ -581,9 +591,11 @@ void PrefsTabHelper::RegisterProfilePrefs(
|
||||
IDS_MINIMUM_FONT_SIZE);
|
||||
RegisterLocalizedFontPref(registry, prefs::kWebKitMinimumLogicalFontSize,
|
||||
IDS_MINIMUM_LOGICAL_FONT_SIZE);
|
||||
bool uses_universal_detector = IsAutodetectEncodingEnabledByDefault() ||
|
||||
l10n_util::GetStringUTF8(IDS_USES_UNIVERSAL_DETECTOR) == "true";
|
||||
registry->RegisterBooleanPref(
|
||||
prefs::kWebKitUsesUniversalDetector,
|
||||
l10n_util::GetStringUTF8(IDS_USES_UNIVERSAL_DETECTOR) == "true",
|
||||
uses_universal_detector,
|
||||
user_prefs::PrefRegistrySyncable::SYNCABLE_PREF);
|
||||
registry->RegisterStringPref(
|
||||
prefs::kStaticEncodings,
|
||||
|
10
third_party/WebKit/Source/core/dom/Document.cpp
vendored
10
third_party/WebKit/Source/core/dom/Document.cpp
vendored
@ -1556,6 +1556,16 @@ void Document::updateStyleInvalidationIfNeeded()
|
||||
styleEngine().styleInvalidator().invalidate(*this);
|
||||
}
|
||||
|
||||
bool Document::attemptedToDetermineEncodingFromContentSniffing() const
|
||||
{
|
||||
return m_encodingData.attemptedToDetermineEncodingFromContentSniffing();
|
||||
}
|
||||
|
||||
bool Document::encodingWasDetectedFromContentSniffing() const
|
||||
{
|
||||
return m_encodingData.encodingWasDetectedFromContentSniffing();
|
||||
}
|
||||
|
||||
void Document::setupFontBuilder(ComputedStyle& documentStyle)
|
||||
{
|
||||
FontBuilder fontBuilder(*this);
|
||||
|
@ -1002,6 +1002,9 @@ public:
|
||||
|
||||
void updateStyleInvalidationIfNeeded();
|
||||
|
||||
bool attemptedToDetermineEncodingFromContentSniffing() const;
|
||||
bool encodingWasDetectedFromContentSniffing() const;
|
||||
|
||||
DECLARE_VIRTUAL_TRACE();
|
||||
|
||||
bool hasSVGFilterElementsRequiringLayerUpdate() const { return m_layerUpdateSVGFilterElements.size(); }
|
||||
|
@ -46,6 +46,8 @@ DocumentEncodingData::DocumentEncodingData(const TextResourceDecoder& decoder)
|
||||
{
|
||||
m_encoding = decoder.encoding();
|
||||
m_wasDetectedHeuristically = decoder.encodingWasDetectedHeuristically();
|
||||
m_attemptedToDetermineEncodingFromContentSniffing = decoder.attemptedToDetermineEncodingFromContentSniffing();
|
||||
m_encodingWasDetectedFromContentSniffing = decoder.encodingWasDetectedFromContentSniffing();
|
||||
m_sawDecodingError = decoder.sawError();
|
||||
}
|
||||
|
||||
|
@ -48,10 +48,14 @@ public:
|
||||
void setEncoding(const WTF::TextEncoding&);
|
||||
bool wasDetectedHeuristically() const { return m_wasDetectedHeuristically; }
|
||||
bool sawDecodingError() const { return m_sawDecodingError; }
|
||||
bool attemptedToDetermineEncodingFromContentSniffing() const { return m_attemptedToDetermineEncodingFromContentSniffing; }
|
||||
bool encodingWasDetectedFromContentSniffing() const { return m_encodingWasDetectedFromContentSniffing; }
|
||||
|
||||
private:
|
||||
WTF::TextEncoding m_encoding;
|
||||
bool m_wasDetectedHeuristically;
|
||||
bool m_attemptedToDetermineEncodingFromContentSniffing;
|
||||
bool m_encodingWasDetectedFromContentSniffing;
|
||||
bool m_sawDecodingError;
|
||||
};
|
||||
|
||||
@ -62,6 +66,8 @@ inline bool operator!=(const DocumentEncodingData& a, const DocumentEncodingData
|
||||
{
|
||||
return a.encoding() != b.encoding()
|
||||
|| a.wasDetectedHeuristically() != b.wasDetectedHeuristically()
|
||||
|| a.attemptedToDetermineEncodingFromContentSniffing() != b.attemptedToDetermineEncodingFromContentSniffing()
|
||||
|| a.encodingWasDetectedFromContentSniffing() != b.encodingWasDetectedFromContentSniffing()
|
||||
|| a.sawDecodingError() != b.sawDecodingError();
|
||||
}
|
||||
|
||||
|
@ -19,7 +19,6 @@
|
||||
Boston, MA 02110-1301, USA.
|
||||
*/
|
||||
|
||||
|
||||
#include "config.h"
|
||||
#include "core/html/parser/TextResourceDecoder.h"
|
||||
|
||||
@ -401,9 +400,7 @@ String TextResourceDecoder::decode(const char* data, size_t len)
|
||||
checkForMetaCharset(dataForDecode, lengthForDecode);
|
||||
|
||||
if (shouldAutoDetect()) {
|
||||
WTF::TextEncoding detectedEncoding;
|
||||
if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))
|
||||
setEncoding(detectedEncoding, EncodingFromContentSniffing);
|
||||
detectTextEncoding(data, len);
|
||||
}
|
||||
|
||||
ASSERT(m_encoding.isValid());
|
||||
@ -417,6 +414,16 @@ String TextResourceDecoder::decode(const char* data, size_t len)
|
||||
return result;
|
||||
}
|
||||
|
||||
void TextResourceDecoder::detectTextEncoding(const char* data, size_t len)
|
||||
{
|
||||
WTF::TextEncoding detectedEncoding;
|
||||
bool detected = blink::detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding);
|
||||
if (detected && detectedEncoding != encoding())
|
||||
setEncoding(detectedEncoding, EncodingFromContentSniffing);
|
||||
else
|
||||
setEncoding(detectedEncoding, DefaultEncodingAttemptedSniffing);
|
||||
}
|
||||
|
||||
String TextResourceDecoder::flush()
|
||||
{
|
||||
// If we can not identify the encoding even after a document is completely
|
||||
@ -424,9 +431,7 @@ String TextResourceDecoder::flush()
|
||||
// autodetection is satisfied.
|
||||
if (m_buffer.size() && shouldAutoDetect()
|
||||
&& ((!m_checkedForXMLCharset && (m_contentType == HTMLContent || m_contentType == XMLContent)) || (!m_checkedForCSSCharset && (m_contentType == CSSContent)))) {
|
||||
WTF::TextEncoding detectedEncoding;
|
||||
if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding))
|
||||
setEncoding(detectedEncoding, EncodingFromContentSniffing);
|
||||
detectTextEncoding(m_buffer.data(), m_buffer.size());
|
||||
}
|
||||
|
||||
if (!m_codec)
|
||||
|
@ -37,6 +37,7 @@ class CORE_EXPORT TextResourceDecoder {
|
||||
public:
|
||||
enum EncodingSource {
|
||||
DefaultEncoding,
|
||||
DefaultEncodingAttemptedSniffing,
|
||||
AutoDetectedEncoding,
|
||||
EncodingFromContentSniffing,
|
||||
EncodingFromXMLHeader,
|
||||
@ -57,6 +58,18 @@ public:
|
||||
bool encodingWasDetectedHeuristically() const
|
||||
{
|
||||
return m_source == AutoDetectedEncoding
|
||||
|| m_source == EncodingFromContentSniffing
|
||||
|| m_source == DefaultEncodingAttemptedSniffing;
|
||||
}
|
||||
|
||||
bool encodingWasDetectedFromContentSniffing() const
|
||||
{
|
||||
return m_source == EncodingFromContentSniffing;
|
||||
}
|
||||
|
||||
bool attemptedToDetermineEncodingFromContentSniffing() const
|
||||
{
|
||||
return m_source == DefaultEncodingAttemptedSniffing
|
||||
|| m_source == EncodingFromContentSniffing;
|
||||
}
|
||||
|
||||
@ -83,6 +96,7 @@ private:
|
||||
bool checkForXMLCharset(const char*, size_t, bool& movedDataToBuffer);
|
||||
void checkForMetaCharset(const char*, size_t);
|
||||
bool shouldAutoDetect() const;
|
||||
void detectTextEncoding(const char*, size_t);
|
||||
|
||||
ContentType m_contentType;
|
||||
WTF::TextEncoding m_encoding;
|
||||
|
12
third_party/WebKit/Source/web/WebDocument.cpp
vendored
12
third_party/WebKit/Source/web/WebDocument.cpp
vendored
@ -322,6 +322,18 @@ WebDistillabilityFeatures WebDocument::distillabilityFeatures()
|
||||
return DocumentStatisticsCollector::collectStatistics(*unwrap<Document>());
|
||||
}
|
||||
|
||||
bool WebDocument::attemptedToDetermineEncodingFromContentSniffing() const
|
||||
{
|
||||
const Document* document = constUnwrap<Document>();
|
||||
return document->attemptedToDetermineEncodingFromContentSniffing();
|
||||
}
|
||||
|
||||
bool WebDocument::encodingWasDetectedFromContentSniffing() const
|
||||
{
|
||||
const Document* document = constUnwrap<Document>();
|
||||
return document->encodingWasDetectedFromContentSniffing();
|
||||
}
|
||||
|
||||
WebDocument::WebDocument(const PassRefPtrWillBeRawPtr<Document>& elem)
|
||||
: WebNode(elem)
|
||||
{
|
||||
|
75
third_party/WebKit/Source/web/WebViewImpl.cpp
vendored
75
third_party/WebKit/Source/web/WebViewImpl.cpp
vendored
@ -323,6 +323,69 @@ private:
|
||||
WebColor m_color;
|
||||
};
|
||||
|
||||
#if OS(ANDROID)
|
||||
// Table used to convert a canonical encoding method name into the index that
// is uploaded to UMA for the experiment on text encoding auto detection.
// The listed order must stay in sync with the enum definition 'EncodingMethod'
// in tools/metrics/histograms/histograms.xml; index 0 ("UNKNOWN") is the
// fallback bucket. Both the strings and the pointers are const so the table
// cannot be modified at runtime.
static const char* const kEncodingNames[] = {
    "UNKNOWN",
    "Big5",
    "EUC-JP",
    "EUC-KR",
    "GBK",
    "IBM866",
    "ISO-2022-JP",
    "ISO-8859-10",
    "ISO-8859-13",
    "ISO-8859-14",
    "ISO-8859-15",
    "ISO-8859-16",
    "ISO-8859-2",
    "ISO-8859-3",
    "ISO-8859-4",
    "ISO-8859-5",
    "ISO-8859-6",
    "ISO-8859-7",
    "ISO-8859-8",
    "ISO-8859-8-I",
    "KOI8-R",
    "KOI8-U",
    "Shift_JIS",
    "UTF-16LE",
    "UTF-8",
    "gb18030",
    "macintosh",
    "windows-1250",
    "windows-1251",
    "windows-1252",
    "windows-1253",
    "windows-1254",
    "windows-1255",
    "windows-1256",
    "windows-1257",
    "windows-1258",
    "windows-874"
};

// The 'EncodingMethod' enum in histograms.xml defines values 0 through 36;
// fail the build if the table is resized without updating the enum.
static_assert(sizeof(kEncodingNames) / sizeof(kEncodingNames[0]) == 37, "kEncodingNames must match the EncodingMethod enum in histograms.xml");
|
||||
|
||||
// Returns the index of the entry in the array that matches
|
||||
// the given encoding method.
|
||||
static int encodingToUmaId(const WTF::TextEncoding& encoding)
|
||||
{
|
||||
const char* encodingName = encoding.name();
|
||||
for (size_t i = 0; i < WTF_ARRAY_LENGTH(kEncodingNames); ++i) {
|
||||
if (!strcasecmp(kEncodingNames[i], encodingName))
|
||||
return i;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static bool isInternalURL(const KURL& url)
|
||||
{
|
||||
const String& protocol = url.protocol();
|
||||
return protocol == "chrome" || protocol == "chrome-native" || protocol == "swappedout";
|
||||
}
|
||||
#endif
|
||||
} // namespace
|
||||
|
||||
// WebView ----------------------------------------------------------------
|
||||
@ -3957,6 +4020,18 @@ void WebViewImpl::didFinishDocumentLoad(WebLocalFrameImpl* webframe)
|
||||
if (webframe != mainFrameImpl())
|
||||
return;
|
||||
resumeTreeViewCommitsIfRenderingReady();
|
||||
#if OS(ANDROID)
|
||||
if (!isInternalURL(webframe->frame()->document()->baseURL()) && page()->settings().usesEncodingDetector()) {
|
||||
const Document& document = *webframe->frame()->document();
|
||||
|
||||
// "AutodetectEncoding.Attempted" is of boolean type - either 0 or 1. Use 2 for the boundary value.
|
||||
Platform::current()->histogramEnumeration("AutodetectEncoding.Attempted", document.attemptedToDetermineEncodingFromContentSniffing(), 2);
|
||||
if (document.encodingWasDetectedFromContentSniffing()) {
|
||||
int encodingId = encodingToUmaId(document.encoding());
|
||||
Platform::current()->histogramEnumeration("AutodetectEncoding.Detected", encodingId, WTF_ARRAY_LENGTH(kEncodingNames) + 1);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
void WebViewImpl::didRemoveAllPendingStylesheet(WebLocalFrameImpl* webframe)
|
||||
|
3
third_party/WebKit/public/web/WebDocument.h
vendored
3
third_party/WebKit/public/web/WebDocument.h
vendored
@ -137,6 +137,9 @@ public:
|
||||
BLINK_EXPORT bool manifestUseCredentials() const;
|
||||
BLINK_EXPORT WebDistillabilityFeatures distillabilityFeatures();
|
||||
|
||||
BLINK_EXPORT bool attemptedToDetermineEncodingFromContentSniffing() const;
|
||||
BLINK_EXPORT bool encodingWasDetectedFromContentSniffing() const;
|
||||
|
||||
#if BLINK_IMPLEMENTATION
|
||||
WebDocument(const PassRefPtrWillBeRawPtr<Document>&);
|
||||
WebDocument& operator=(const PassRefPtrWillBeRawPtr<Document>&);
|
||||
|
@ -2054,6 +2054,30 @@ http://cs/file:chrome/histograms.xml - but prefer this file for new entries.
|
||||
</summary>
|
||||
</histogram>
|
||||
|
||||
<histogram name="AutodetectEncoding.Attempted" enum="BooleanAttempted">
|
||||
<owner>jinsukkim@chromium.org</owner>
|
||||
<summary>
|
||||
Whether the text encoding auto detection logic was attempted for a web page.
|
||||
The logic is triggered when the parser fails to find the encoding method
|
||||
from other signals such as http header, meta tag, BOM, etc.
|
||||
|
||||
If the logic detects an encoding method that differs from the default one,
the detected encoding method is additionally reported through
AutodetectEncoding.Detected (see below). Otherwise - i.e. the detection
logic fails for the page, or the detected encoding is the same as the
default - nothing is reported to AutodetectEncoding.Detected.
|
||||
</summary>
|
||||
</histogram>
|
||||
|
||||
<histogram name="AutodetectEncoding.Detected" enum="EncodingMethod"
|
||||
units="pages">
|
||||
<owner>jinsukkim@chromium.org</owner>
|
||||
<summary>
|
||||
The number of web pages whose encoding method is found by the auto detection
|
||||
logic. Grouped by the encoding methods defined in EncodingMethod.
|
||||
</summary>
|
||||
</histogram>
|
||||
|
||||
<histogram name="Autofill.AddressBook.AccessSkipped" enum="BooleanSkipped">
|
||||
<obsolete>
|
||||
Deprecated as of 8/2015.
|
||||
@ -59528,6 +59552,46 @@ http://cs/file:chrome/histograms.xml - but prefer this file for new entries.
|
||||
<int value="9" label="SCRIPT_READ_FINISHED"/>
|
||||
</enum>
|
||||
|
||||
<enum name="EncodingMethod" type="int">
|
||||
<int value="0" label="UNKNOWN"/>
|
||||
<int value="1" label="Big5"/>
|
||||
<int value="2" label="EUC-JP"/>
|
||||
<int value="3" label="EUC-KR"/>
|
||||
<int value="4" label="GBK"/>
|
||||
<int value="5" label="IBM866"/>
|
||||
<int value="6" label="ISO-2022-JP"/>
|
||||
<int value="7" label="ISO-8859-10"/>
|
||||
<int value="8" label="ISO-8859-13"/>
|
||||
<int value="9" label="ISO-8859-14"/>
|
||||
<int value="10" label="ISO-8859-15"/>
|
||||
<int value="11" label="ISO-8859-16"/>
|
||||
<int value="12" label="ISO-8859-2"/>
|
||||
<int value="13" label="ISO-8859-3"/>
|
||||
<int value="14" label="ISO-8859-4"/>
|
||||
<int value="15" label="ISO-8859-5"/>
|
||||
<int value="16" label="ISO-8859-6"/>
|
||||
<int value="17" label="ISO-8859-7"/>
|
||||
<int value="18" label="ISO-8859-8"/>
|
||||
<int value="19" label="ISO-8859-8-I"/>
|
||||
<int value="20" label="KOI8-R"/>
|
||||
<int value="21" label="KOI8-U"/>
|
||||
<int value="22" label="Shift_JIS"/>
|
||||
<int value="23" label="UTF-16LE"/>
|
||||
<int value="24" label="UTF-8"/>
|
||||
<int value="25" label="gb18030"/>
|
||||
<int value="26" label="macintosh"/>
|
||||
<int value="27" label="windows-1250"/>
|
||||
<int value="28" label="windows-1251"/>
|
||||
<int value="29" label="windows-1252"/>
|
||||
<int value="30" label="windows-1253"/>
|
||||
<int value="31" label="windows-1254"/>
|
||||
<int value="32" label="windows-1255"/>
|
||||
<int value="33" label="windows-1256"/>
|
||||
<int value="34" label="windows-1257"/>
|
||||
<int value="35" label="windows-1258"/>
|
||||
<int value="36" label="windows-874"/>
|
||||
</enum>
|
||||
|
||||
<enum name="EnhancedBookmarkViewMode" type="int">
|
||||
<obsolete>
|
||||
Deprecated 9/2015.
|
||||
|
Reference in New Issue
Block a user