Replace RE2 import with a dependency
TBR=armansito@chromium.org,reillyg@chromium.org,piman@chromium.org BUG=568119 Review URL: https://codereview.chromium.org/1544433002 Cr-Commit-Position: refs/heads/master@{#366412}
This commit is contained in:
.gitignoreDEPS
chrome/browser
android
data_usage
chromeos
download
extensions
activity_log
password_manager
search
ui
webui
extensions
components
autofill
dom_distiller
core
drive
json_schema
keyed_service
password_manager
core
browser
import
plugins
renderer
policy
core
common
url_matcher
webcrypto
algorithms
content/shell/android/linker_test_apk
device/serial
extensions
browser
api
common
gpu
third_party
leveldatabase
re2
.gitignoreAUTHORSBUILDBUILD.gnCMakeLists.txtCONTRIBUTING.mdCONTRIBUTORSMakefileREADMEWORKSPACE
benchlog
doc
lib
libre2.symbolslibre2.symbols.darwinre2.gypre2.pcre2
bitstate.cccompile.ccdfa.ccfiltered_re2.ccfiltered_re2.hmake_perl_groups.plmake_unicode_casefold.pymake_unicode_groups.pymimics_pcre.ccnfa.cconepass.ccparse.ccperl_groups.ccprefilter.ccprefilter.hprefilter_tree.ccprefilter_tree.hprog.ccprog.hre2.ccre2.hregexp.ccregexp.hset.ccset.hsimplify.ccstringpiece.ccstringpiece.h
re2_test.bzlrunteststestinstall.ccucs2.difftesting
backtrack.cccharclass_test.cccompile_test.ccdfa_test.ccdump.ccexhaustive1_test.ccexhaustive2_test.ccexhaustive3_test.ccexhaustive_test.ccexhaustive_tester.ccexhaustive_tester.hfiltered_re2_test.ccmimics_pcre_test.ccnull_walker.ccparse_test.ccpossible_match_test.ccrandom_test.ccre2_arg_test.ccre2_test.ccregexp_benchmark.ccregexp_generator.ccregexp_generator.hregexp_test.ccrequired_prefix_test.ccsearch_test.ccset_test.ccsimplify_test.ccstring_generator.ccstring_generator.hstring_generator_test.cctester.cctester.h
tostring.ccunicode.pyunicode_casefold.ccunicode_casefold.hunicode_groups.ccunicode_groups.hvariadic_function.hwalker-inl.hutil
tools/ipc_fuzzer/message_tools
1
.gitignore
vendored
1
.gitignore
vendored
@ -388,6 +388,7 @@ vs-chromium-project.txt
|
||||
/third_party/python_26
|
||||
/third_party/pywebsocket/src
|
||||
/third_party/pywebsocket/src
|
||||
/third_party/re2/src
|
||||
/third_party/requests/src
|
||||
/third_party/robolectric/lib
|
||||
/third_party/safe_browsing/testing
|
||||
|
3
DEPS
3
DEPS
@ -275,6 +275,9 @@ deps = {
|
||||
|
||||
'src/third_party/openh264/src':
|
||||
Var('chromium_git') + '/external/github.com/cisco/openh264' + '@' + 'b37cda248234162033e3e11b0335f3131cdfe488',
|
||||
|
||||
'src/third_party/re2/src':
|
||||
Var('chromium_git') + '/external/github.com/google/re2.git' + '@' + 'dba3349aba83b5588e85e5ecf2b56c97f2d259b7',
|
||||
}
|
||||
|
||||
|
||||
|
@ -11,7 +11,7 @@
|
||||
#include "base/time/default_tick_clock.h"
|
||||
#include "base/time/tick_clock.h"
|
||||
#include "chrome/browser/android/data_usage/external_data_use_observer.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
#include "url/gurl.h"
|
||||
|
||||
namespace chrome {
|
||||
|
@ -14,7 +14,7 @@
|
||||
#include "chromeos/chromeos_switches.h"
|
||||
#include "chromeos/system/statistics_provider.h"
|
||||
#include "content/public/common/content_switches.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
#include "third_party/zlib/zlib.h"
|
||||
|
||||
namespace {
|
||||
|
@ -22,7 +22,7 @@
|
||||
#include "components/policy/core/common/cloud/enterprise_metrics.h"
|
||||
#include "content/public/browser/browser_thread.h"
|
||||
#include "net/http/http_request_headers.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
|
||||
namespace {
|
||||
// The maximum number of successive retries.
|
||||
|
@ -31,7 +31,7 @@
|
||||
#include "components/url_formatter/url_formatter.h"
|
||||
#include "content/public/browser/content_browser_client.h"
|
||||
#include "content/public/browser/download_item.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
#include "url/gurl.h"
|
||||
|
||||
using content::DownloadDangerType;
|
||||
|
@ -36,7 +36,7 @@
|
||||
#include "extensions/browser/extensions_browser_client.h"
|
||||
#include "extensions/common/extension.h"
|
||||
#include "extensions/common/one_shot_event.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
#include "url/gurl.h"
|
||||
|
||||
namespace constants = activity_log_constants;
|
||||
|
@ -53,7 +53,7 @@
|
||||
#include "content/public/browser/web_contents.h"
|
||||
#include "google_apis/gaia/gaia_urls.h"
|
||||
#include "net/base/url_util.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
|
||||
#if defined(OS_MACOSX) || BUILDFLAG(ANDROID_JAVA_UI)
|
||||
#include "chrome/browser/password_manager/save_password_infobar_delegate.h"
|
||||
|
@ -18,8 +18,8 @@
|
||||
#include "chrome/common/url_constants.h"
|
||||
#include "content/public/browser/browser_thread.h"
|
||||
#include "content/public/browser/url_data_source.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/re2/stringpiece.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/stringpiece.h"
|
||||
|
||||
namespace {
|
||||
|
||||
|
@ -25,7 +25,7 @@
|
||||
#include "extensions/common/constants.h"
|
||||
#include "extensions/common/extension.h"
|
||||
#include "extensions/common/manifest_constants.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
#include "ui/base/l10n/l10n_util.h"
|
||||
|
||||
namespace extensions {
|
||||
|
@ -22,7 +22,7 @@
|
||||
#include "third_party/WebKit/public/web/WebFormControlElement.h"
|
||||
#include "third_party/WebKit/public/web/WebFrame.h"
|
||||
#include "third_party/WebKit/public/web/WebInputElement.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
|
||||
using blink::WebDocument;
|
||||
using blink::WebFormControlElement;
|
||||
|
@ -34,7 +34,7 @@
|
||||
#include "components/rappor/rappor_service.h"
|
||||
#include "components/rappor/rappor_utils.h"
|
||||
#include "third_party/libxml/chromium/libxml_utils.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
|
||||
namespace autofill {
|
||||
namespace {
|
||||
|
@ -7,7 +7,7 @@
|
||||
#include <string>
|
||||
|
||||
#include "base/json/json_reader.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
#include "url/gurl.h"
|
||||
|
||||
namespace dom_distiller {
|
||||
|
@ -18,7 +18,7 @@
|
||||
#include "google_apis/drive/drive_api_parser.h"
|
||||
#include "net/base/escape.h"
|
||||
#include "net/base/net_errors.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
#include "url/gurl.h"
|
||||
|
||||
namespace drive {
|
||||
|
@ -17,7 +17,7 @@
|
||||
#include "base/strings/stringprintf.h"
|
||||
#include "base/values.h"
|
||||
#include "components/json_schema/json_schema_constants.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
|
||||
namespace schema = json_schema_constants;
|
||||
|
||||
|
@ -8,7 +8,7 @@
|
||||
#include "components/keyed_service/core/dependency_graph.h"
|
||||
#include "components/keyed_service/core/dependency_node.h"
|
||||
#include "testing/gtest/include/gtest/gtest.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
|
||||
namespace {
|
||||
|
||||
|
@ -6,7 +6,7 @@
|
||||
|
||||
#include "base/logging.h"
|
||||
#include "base/strings/string_util.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
|
||||
namespace {
|
||||
|
||||
|
@ -11,7 +11,7 @@
|
||||
#include "gin/object_template_builder.h"
|
||||
#include "third_party/WebKit/public/web/WebElement.h"
|
||||
#include "third_party/WebKit/public/web/WebPluginContainer.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
|
||||
namespace plugins {
|
||||
|
||||
|
@ -17,7 +17,7 @@
|
||||
#include "components/json_schema/json_schema_constants.h"
|
||||
#include "components/json_schema/json_schema_validator.h"
|
||||
#include "components/policy/core/common/schema_internal.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
|
||||
namespace schema = json_schema_constants;
|
||||
|
||||
|
@ -8,8 +8,8 @@
|
||||
#include "base/stl_util.h"
|
||||
#include "base/strings/string_util.h"
|
||||
#include "components/url_matcher/substring_set_matcher.h"
|
||||
#include "third_party/re2/re2/filtered_re2.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/filtered_re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
|
||||
namespace url_matcher {
|
||||
|
||||
|
@ -13,7 +13,7 @@
|
||||
#include "base/values.h"
|
||||
#include "components/url_matcher/url_matcher_constants.h"
|
||||
#include "components/url_matcher/url_matcher_helpers.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
|
||||
namespace url_matcher {
|
||||
|
||||
|
@ -23,7 +23,7 @@
|
||||
#include "components/webcrypto/status.h"
|
||||
#include "third_party/WebKit/public/platform/WebCryptoAlgorithmParams.h"
|
||||
#include "third_party/WebKit/public/platform/WebCryptoKeyAlgorithm.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
|
||||
namespace webcrypto {
|
||||
|
||||
|
@ -20,7 +20,7 @@
|
||||
#include "base/logging.h"
|
||||
#include "base/strings/stringprintf.h"
|
||||
#include "jni/LinkerTests_jni.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
|
||||
namespace content {
|
||||
|
||||
|
@ -16,7 +16,7 @@
|
||||
#include "base/strings/stringprintf.h"
|
||||
#include "base/strings/utf_string_conversions.h"
|
||||
#include "base/win/registry.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
|
||||
namespace device {
|
||||
|
||||
|
@ -12,7 +12,7 @@
|
||||
#include "base/threading/thread_checker.h"
|
||||
#include "device/core/device_info_query_win.h"
|
||||
#include "device/core/device_monitor_win.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
|
||||
namespace device {
|
||||
|
||||
|
@ -28,7 +28,7 @@
|
||||
#include "net/base/registry_controlled_domains/registry_controlled_domain.h"
|
||||
#include "net/http/http_util.h"
|
||||
#include "net/url_request/url_request.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
|
||||
using content::ResourceRequestInfo;
|
||||
|
||||
|
@ -13,7 +13,7 @@
|
||||
#include "base/values.h"
|
||||
#include "net/base/escape.h"
|
||||
#include "net/url_request/url_request.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
|
||||
using base::DictionaryValue;
|
||||
using base::ListValue;
|
||||
|
@ -10,7 +10,7 @@
|
||||
#include "base/strings/string_number_conversions.h"
|
||||
#include "base/strings/string_util.h"
|
||||
#include "base/strings/stringprintf.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
#include "third_party/skia/include/core/SkColor.h"
|
||||
#include "ui/gfx/color_utils.h"
|
||||
|
||||
|
@ -7,7 +7,7 @@
|
||||
#include <string>
|
||||
|
||||
#include "base/strings/utf_string_conversions.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
|
||||
namespace extensions {
|
||||
|
||||
|
@ -29,7 +29,7 @@
|
||||
#include "gpu/command_buffer/service/gpu_switches.h"
|
||||
#include "gpu/command_buffer/service/program_cache.h"
|
||||
#include "gpu/command_buffer/service/shader_manager.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
#include "ui/gl/gl_version_info.h"
|
||||
|
||||
using base::TimeDelta;
|
||||
|
@ -17,7 +17,7 @@
|
||||
#include "base/sys_info.h"
|
||||
#include "gpu/config/gpu_info.h"
|
||||
#include "gpu/config/gpu_util.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
|
||||
namespace gpu {
|
||||
namespace {
|
||||
|
@ -5,7 +5,7 @@
|
||||
#include "gpu/config/gpu_info_collector.h"
|
||||
|
||||
// This has to be included before windows.h.
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
|
||||
#include <windows.h>
|
||||
#include <cfgmgr32.h>
|
||||
|
2
third_party/leveldatabase/env_chromium.cc
vendored
2
third_party/leveldatabase/env_chromium.cc
vendored
@ -19,7 +19,7 @@
|
||||
#include "base/threading/thread_restrictions.h"
|
||||
#include "base/trace_event/trace_event.h"
|
||||
#include "third_party/leveldatabase/chromium_logger.h"
|
||||
#include "third_party/re2/re2/re2.h"
|
||||
#include "third_party/re2/src/re2/re2.h"
|
||||
|
||||
using base::FilePath;
|
||||
using leveldb::FileLock;
|
||||
|
5
third_party/re2/.gitignore
vendored
5
third_party/re2/.gitignore
vendored
@ -1,5 +0,0 @@
|
||||
*.pyc
|
||||
*.orig
|
||||
core
|
||||
obj/
|
||||
benchlog.*
|
13
third_party/re2/AUTHORS
vendored
13
third_party/re2/AUTHORS
vendored
@ -1,13 +0,0 @@
|
||||
# This is the official list of RE2 authors for copyright purposes.
|
||||
# This file is distinct from the CONTRIBUTORS files.
|
||||
# See the latter for an explanation.
|
||||
|
||||
# Names should be added to this file as
|
||||
# Name or Organization <email address>
|
||||
# The email address is not required for organizations.
|
||||
|
||||
# Please keep the list sorted.
|
||||
|
||||
Google Inc.
|
||||
Samsung Electronics
|
||||
Stefano Rivera <stefano.rivera@gmail.com>
|
121
third_party/re2/BUILD
vendored
121
third_party/re2/BUILD
vendored
@ -1,121 +0,0 @@
|
||||
# Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
# Bazel (http://bazel.io/) BUILD file for RE2.
|
||||
|
||||
licenses(["notice"])
|
||||
|
||||
cc_library(
|
||||
name = "re2",
|
||||
srcs = [
|
||||
"re2/bitstate.cc",
|
||||
"re2/compile.cc",
|
||||
"re2/dfa.cc",
|
||||
"re2/filtered_re2.cc",
|
||||
"re2/mimics_pcre.cc",
|
||||
"re2/nfa.cc",
|
||||
"re2/onepass.cc",
|
||||
"re2/parse.cc",
|
||||
"re2/perl_groups.cc",
|
||||
"re2/prefilter.cc",
|
||||
"re2/prefilter.h",
|
||||
"re2/prefilter_tree.cc",
|
||||
"re2/prefilter_tree.h",
|
||||
"re2/prog.cc",
|
||||
"re2/prog.h",
|
||||
"re2/re2.cc",
|
||||
"re2/regexp.cc",
|
||||
"re2/regexp.h",
|
||||
"re2/set.cc",
|
||||
"re2/simplify.cc",
|
||||
"re2/stringpiece.cc",
|
||||
"re2/tostring.cc",
|
||||
"re2/unicode_casefold.cc",
|
||||
"re2/unicode_casefold.h",
|
||||
"re2/unicode_groups.cc",
|
||||
"re2/unicode_groups.h",
|
||||
"re2/walker-inl.h",
|
||||
"util/atomicops.h",
|
||||
"util/flags.h",
|
||||
"util/hash.cc",
|
||||
"util/logging.cc",
|
||||
"util/logging.h",
|
||||
"util/mutex.h",
|
||||
"util/rune.cc",
|
||||
"util/sparse_array.h",
|
||||
"util/sparse_set.h",
|
||||
"util/stringprintf.cc",
|
||||
"util/strutil.cc",
|
||||
"util/utf.h",
|
||||
"util/util.h",
|
||||
"util/valgrind.cc",
|
||||
"util/valgrind.h",
|
||||
],
|
||||
hdrs = [
|
||||
"re2/filtered_re2.h",
|
||||
"re2/re2.h",
|
||||
"re2/set.h",
|
||||
"re2/stringpiece.h",
|
||||
"re2/variadic_function.h",
|
||||
],
|
||||
includes = ["."],
|
||||
linkopts = ["-pthread"],
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "test",
|
||||
testonly = 1,
|
||||
srcs = [
|
||||
"re2/testing/backtrack.cc",
|
||||
"re2/testing/dump.cc",
|
||||
"re2/testing/exhaustive_tester.cc",
|
||||
"re2/testing/null_walker.cc",
|
||||
"re2/testing/regexp_generator.cc",
|
||||
"re2/testing/string_generator.cc",
|
||||
"re2/testing/tester.cc",
|
||||
"util/pcre.cc",
|
||||
"util/random.cc",
|
||||
"util/test.cc",
|
||||
"util/thread.cc",
|
||||
],
|
||||
hdrs = [
|
||||
"re2/testing/exhaustive_tester.h",
|
||||
"re2/testing/regexp_generator.h",
|
||||
"re2/testing/string_generator.h",
|
||||
"re2/testing/tester.h",
|
||||
"util/pcre.h",
|
||||
"util/random.h",
|
||||
"util/test.h",
|
||||
"util/thread.h",
|
||||
],
|
||||
includes = ["."],
|
||||
deps = [":re2"],
|
||||
)
|
||||
|
||||
load("re2_test", "re2_test")
|
||||
|
||||
re2_test("charclass_test")
|
||||
re2_test("compile_test")
|
||||
re2_test("filtered_re2_test")
|
||||
re2_test("mimics_pcre_test")
|
||||
re2_test("parse_test")
|
||||
re2_test("possible_match_test")
|
||||
re2_test("re2_test")
|
||||
re2_test("re2_arg_test")
|
||||
re2_test("regexp_test")
|
||||
re2_test("required_prefix_test")
|
||||
re2_test("search_test")
|
||||
re2_test("set_test")
|
||||
re2_test("simplify_test")
|
||||
re2_test("string_generator_test")
|
||||
|
||||
re2_test("dfa_test")
|
||||
re2_test("exhaustive1_test")
|
||||
re2_test("exhaustive2_test")
|
||||
re2_test("exhaustive3_test")
|
||||
re2_test("exhaustive_test")
|
||||
re2_test("random_test")
|
||||
|
||||
# TODO: Add support for regexp_benchmark.
|
96
third_party/re2/BUILD.gn
vendored
96
third_party/re2/BUILD.gn
vendored
@ -3,58 +3,58 @@
|
||||
# found in the LICENSE file.
|
||||
|
||||
config("re2_config") {
|
||||
include_dirs = [ "." ]
|
||||
include_dirs = [ "src" ]
|
||||
}
|
||||
|
||||
static_library("re2") {
|
||||
sources = [
|
||||
"re2/bitstate.cc",
|
||||
"re2/compile.cc",
|
||||
"re2/dfa.cc",
|
||||
"re2/filtered_re2.cc",
|
||||
"re2/filtered_re2.h",
|
||||
"re2/mimics_pcre.cc",
|
||||
"re2/nfa.cc",
|
||||
"re2/onepass.cc",
|
||||
"re2/parse.cc",
|
||||
"re2/perl_groups.cc",
|
||||
"re2/prefilter.cc",
|
||||
"re2/prefilter.h",
|
||||
"re2/prefilter_tree.cc",
|
||||
"re2/prefilter_tree.h",
|
||||
"re2/prog.cc",
|
||||
"re2/prog.h",
|
||||
"re2/re2.cc",
|
||||
"re2/re2.h",
|
||||
"re2/regexp.cc",
|
||||
"re2/regexp.h",
|
||||
"re2/set.cc",
|
||||
"re2/set.h",
|
||||
"re2/simplify.cc",
|
||||
"re2/stringpiece.cc",
|
||||
"re2/stringpiece.h",
|
||||
"re2/tostring.cc",
|
||||
"re2/unicode_casefold.cc",
|
||||
"re2/unicode_casefold.h",
|
||||
"re2/unicode_groups.cc",
|
||||
"re2/unicode_groups.h",
|
||||
"re2/variadic_function.h",
|
||||
"re2/walker-inl.h",
|
||||
"util/atomicops.h",
|
||||
"util/flags.h",
|
||||
"util/hash.cc",
|
||||
"util/logging.cc",
|
||||
"util/logging.h",
|
||||
"util/mutex.h",
|
||||
"util/rune.cc",
|
||||
"util/sparse_array.h",
|
||||
"util/sparse_set.h",
|
||||
"util/stringprintf.cc",
|
||||
"util/strutil.cc",
|
||||
"util/utf.h",
|
||||
"util/util.h",
|
||||
"util/valgrind.cc",
|
||||
"util/valgrind.h",
|
||||
"src/re2/bitstate.cc",
|
||||
"src/re2/compile.cc",
|
||||
"src/re2/dfa.cc",
|
||||
"src/re2/filtered_re2.cc",
|
||||
"src/re2/filtered_re2.h",
|
||||
"src/re2/mimics_pcre.cc",
|
||||
"src/re2/nfa.cc",
|
||||
"src/re2/onepass.cc",
|
||||
"src/re2/parse.cc",
|
||||
"src/re2/perl_groups.cc",
|
||||
"src/re2/prefilter.cc",
|
||||
"src/re2/prefilter.h",
|
||||
"src/re2/prefilter_tree.cc",
|
||||
"src/re2/prefilter_tree.h",
|
||||
"src/re2/prog.cc",
|
||||
"src/re2/prog.h",
|
||||
"src/re2/re2.cc",
|
||||
"src/re2/re2.h",
|
||||
"src/re2/regexp.cc",
|
||||
"src/re2/regexp.h",
|
||||
"src/re2/set.cc",
|
||||
"src/re2/set.h",
|
||||
"src/re2/simplify.cc",
|
||||
"src/re2/stringpiece.cc",
|
||||
"src/re2/stringpiece.h",
|
||||
"src/re2/tostring.cc",
|
||||
"src/re2/unicode_casefold.cc",
|
||||
"src/re2/unicode_casefold.h",
|
||||
"src/re2/unicode_groups.cc",
|
||||
"src/re2/unicode_groups.h",
|
||||
"src/re2/variadic_function.h",
|
||||
"src/re2/walker-inl.h",
|
||||
"src/util/atomicops.h",
|
||||
"src/util/flags.h",
|
||||
"src/util/hash.cc",
|
||||
"src/util/logging.cc",
|
||||
"src/util/logging.h",
|
||||
"src/util/mutex.h",
|
||||
"src/util/rune.cc",
|
||||
"src/util/sparse_array.h",
|
||||
"src/util/sparse_set.h",
|
||||
"src/util/stringprintf.cc",
|
||||
"src/util/strutil.cc",
|
||||
"src/util/utf.h",
|
||||
"src/util/util.h",
|
||||
"src/util/valgrind.cc",
|
||||
"src/util/valgrind.h",
|
||||
]
|
||||
|
||||
configs -= [ "//build/config/compiler:chromium_code" ]
|
||||
|
112
third_party/re2/CMakeLists.txt
vendored
112
third_party/re2/CMakeLists.txt
vendored
@ -1,112 +0,0 @@
|
||||
# Copyright 2015 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
# Old enough to support Ubuntu Precise.
|
||||
cmake_minimum_required(VERSION 2.8.7)
|
||||
|
||||
project(RE2 CXX)
|
||||
option(BUILD_SHARED_LIBS "build shared libraries" OFF)
|
||||
option(USEPCRE "use PCRE in tests and benchmarks" OFF)
|
||||
|
||||
set(EXTRA_TARGET_LINK_LIBRARIES)
|
||||
|
||||
if(WIN32)
|
||||
add_definitions(-DUNICODE -D_UNICODE -DSTRICT -DNOMINMAX)
|
||||
set(THREADING threadwin)
|
||||
else()
|
||||
set(THREADING thread)
|
||||
list(APPEND EXTRA_TARGET_LINK_LIBRARIES -pthread)
|
||||
endif()
|
||||
|
||||
if(USEPCRE)
|
||||
add_definitions(-DUSEPCRE)
|
||||
list(APPEND EXTRA_TARGET_LINK_LIBRARIES pcre)
|
||||
endif()
|
||||
|
||||
include_directories(${CMAKE_SOURCE_DIR})
|
||||
|
||||
set(RE2_LIBRARY_SOURCES
|
||||
re2/bitstate.cc
|
||||
re2/compile.cc
|
||||
re2/dfa.cc
|
||||
re2/filtered_re2.cc
|
||||
re2/mimics_pcre.cc
|
||||
re2/nfa.cc
|
||||
re2/onepass.cc
|
||||
re2/parse.cc
|
||||
re2/perl_groups.cc
|
||||
re2/prefilter.cc
|
||||
re2/prefilter_tree.cc
|
||||
re2/prog.cc
|
||||
re2/re2.cc
|
||||
re2/regexp.cc
|
||||
re2/set.cc
|
||||
re2/simplify.cc
|
||||
re2/stringpiece.cc
|
||||
re2/tostring.cc
|
||||
re2/unicode_casefold.cc
|
||||
re2/unicode_groups.cc
|
||||
util/hash.cc
|
||||
util/logging.cc
|
||||
util/rune.cc
|
||||
util/stringprintf.cc
|
||||
util/strutil.cc
|
||||
util/valgrind.cc
|
||||
)
|
||||
|
||||
add_library(re2 ${RE2_LIBRARY_SOURCES})
|
||||
|
||||
set(TEST_LIBRARY_SOURCES
|
||||
re2/testing/backtrack.cc
|
||||
re2/testing/dump.cc
|
||||
re2/testing/exhaustive_tester.cc
|
||||
re2/testing/null_walker.cc
|
||||
re2/testing/regexp_generator.cc
|
||||
re2/testing/string_generator.cc
|
||||
re2/testing/tester.cc
|
||||
util/pcre.cc
|
||||
util/random.cc
|
||||
util/${THREADING}.cc
|
||||
)
|
||||
|
||||
add_library(test STATIC ${TEST_LIBRARY_SOURCES} util/test.cc)
|
||||
add_library(benchmark STATIC ${TEST_LIBRARY_SOURCES} util/benchmark.cc)
|
||||
|
||||
set(TEST_TARGETS
|
||||
charclass_test
|
||||
compile_test
|
||||
filtered_re2_test
|
||||
mimics_pcre_test
|
||||
parse_test
|
||||
possible_match_test
|
||||
re2_test
|
||||
re2_arg_test
|
||||
regexp_test
|
||||
required_prefix_test
|
||||
search_test
|
||||
set_test
|
||||
simplify_test
|
||||
string_generator_test
|
||||
|
||||
dfa_test
|
||||
exhaustive1_test
|
||||
exhaustive2_test
|
||||
exhaustive3_test
|
||||
exhaustive_test
|
||||
random_test
|
||||
)
|
||||
|
||||
set(BENCHMARK_TARGETS
|
||||
regexp_benchmark
|
||||
)
|
||||
|
||||
foreach(target ${TEST_TARGETS})
|
||||
add_executable(${target} re2/testing/${target}.cc)
|
||||
target_link_libraries(${target} test re2 ${EXTRA_TARGET_LINK_LIBRARIES})
|
||||
endforeach(target)
|
||||
|
||||
foreach(target ${BENCHMARK_TARGETS})
|
||||
add_executable(${target} re2/testing/${target}.cc)
|
||||
target_link_libraries(${target} benchmark re2 ${EXTRA_TARGET_LINK_LIBRARIES})
|
||||
endforeach(target)
|
2
third_party/re2/CONTRIBUTING.md
vendored
2
third_party/re2/CONTRIBUTING.md
vendored
@ -1,2 +0,0 @@
|
||||
RE2 uses Gerrit instead of GitHub pull requests.
|
||||
See the [Contributing](https://github.com/google/re2/wiki/Contribute) wiki page.
|
41
third_party/re2/CONTRIBUTORS
vendored
41
third_party/re2/CONTRIBUTORS
vendored
@ -1,41 +0,0 @@
|
||||
# This is the official list of people who can contribute
|
||||
# (and typically have contributed) code to the RE2 repository.
|
||||
# The AUTHORS file lists the copyright holders; this file
|
||||
# lists people. For example, Google employees are listed here
|
||||
# but not in AUTHORS, because Google holds the copyright.
|
||||
#
|
||||
# The submission process automatically checks to make sure
|
||||
# that people submitting code are listed in this file (by email address).
|
||||
#
|
||||
# Names should be added to this file only after verifying that
|
||||
# the individual or the individual's organization has agreed to
|
||||
# the appropriate Contributor License Agreement, found here:
|
||||
#
|
||||
# http://code.google.com/legal/individual-cla-v1.0.html
|
||||
# http://code.google.com/legal/corporate-cla-v1.0.html
|
||||
#
|
||||
# The agreement for individuals can be filled out on the web.
|
||||
#
|
||||
# When adding J Random Contributor's name to this file,
|
||||
# either J's name or J's organization's name should be
|
||||
# added to the AUTHORS file, depending on whether the
|
||||
# individual or corporate CLA was used.
|
||||
|
||||
# Names should be added to this file like so:
|
||||
# Name <email address>
|
||||
|
||||
# Please keep the list sorted.
|
||||
|
||||
Dominic Battré <battre@chromium.org>
|
||||
Doug Kwan <dougkwan@google.com>
|
||||
Dmitriy Vyukov <dvyukov@google.com>
|
||||
John Millikin <jmillikin@gmail.com>
|
||||
Mike Nazarewicz <mpn@google.com>
|
||||
Nico Weber <thakis@chromium.org>
|
||||
Pawel Hajdan <phajdan.jr@gmail.com>
|
||||
Rob Pike <r@google.com>
|
||||
Russ Cox <rsc@swtch.com>
|
||||
Sanjay Ghemawat <sanjay@google.com>
|
||||
Stefano Rivera <stefano.rivera@gmail.com>
|
||||
Srinivasan Venkatachary <vsri@google.com>
|
||||
Viatcheslav Ostapenko <sl.ostapenko@samsung.com>
|
310
third_party/re2/Makefile
vendored
310
third_party/re2/Makefile
vendored
@ -1,310 +0,0 @@
|
||||
# Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
# to build against PCRE for testing or benchmarking,
|
||||
# uncomment the next two lines
|
||||
# CCPCRE=-I/usr/local/include -DUSEPCRE
|
||||
# LDPCRE=-L/usr/local/lib -lpcre
|
||||
|
||||
CXX?=g++
|
||||
CXXFLAGS?=-O3 -g # can override
|
||||
RE2_CXXFLAGS?=-Wall -Wextra -Wno-unused-parameter -Wno-missing-field-initializers -I. $(CCPCRE) # required
|
||||
LDFLAGS?=-pthread
|
||||
AR?=ar
|
||||
ARFLAGS?=rsc
|
||||
NM?=nm
|
||||
NMFLAGS?=-p
|
||||
|
||||
# Variables mandated by GNU, the arbiter of all good taste on the internet.
|
||||
# http://www.gnu.org/prep/standards/standards.html
|
||||
prefix=/usr/local
|
||||
exec_prefix=$(prefix)
|
||||
bindir=$(exec_prefix)/bin
|
||||
includedir=$(prefix)/include
|
||||
libdir=$(exec_prefix)/lib
|
||||
INSTALL=install
|
||||
INSTALL_PROGRAM=$(INSTALL)
|
||||
INSTALL_DATA=$(INSTALL) -m 644
|
||||
|
||||
# ABI version
|
||||
# http://tldp.org/HOWTO/Program-Library-HOWTO/shared-libraries.html
|
||||
SONAME=0
|
||||
|
||||
# To rebuild the Tables generated by Perl and Python scripts (requires Internet
|
||||
# access for Unicode data), uncomment the following line:
|
||||
# REBUILD_TABLES=1
|
||||
|
||||
ifeq ($(shell uname),Darwin)
|
||||
SOEXT=dylib
|
||||
SOEXTVER=$(SONAME).$(SOEXT)
|
||||
SOEXTVER00=$(SONAME).0.0.$(SOEXT)
|
||||
MAKE_SHARED_LIBRARY=$(CXX) -dynamiclib $(LDFLAGS) -Wl,-install_name,@rpath/libre2.$(SOEXTVER) -exported_symbols_list libre2.symbols.darwin
|
||||
else ifeq ($(shell uname),SunOS)
|
||||
SOEXT=so
|
||||
SOEXTVER=$(SOEXT).$(SONAME)
|
||||
SOEXTVER00=$(SOEXT).$(SONAME).0.0
|
||||
MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),-M,libre2.symbols $(LDFLAGS)
|
||||
else
|
||||
SOEXT=so
|
||||
SOEXTVER=$(SOEXT).$(SONAME)
|
||||
SOEXTVER00=$(SOEXT).$(SONAME).0.0
|
||||
MAKE_SHARED_LIBRARY=$(CXX) -shared -Wl,-soname,libre2.$(SOEXTVER),--version-script,libre2.symbols $(LDFLAGS)
|
||||
endif
|
||||
|
||||
all: obj/libre2.a obj/so/libre2.$(SOEXT)
|
||||
|
||||
INSTALL_HFILES=\
|
||||
re2/filtered_re2.h\
|
||||
re2/re2.h\
|
||||
re2/set.h\
|
||||
re2/stringpiece.h\
|
||||
re2/variadic_function.h\
|
||||
|
||||
HFILES=\
|
||||
util/atomicops.h\
|
||||
util/benchmark.h\
|
||||
util/flags.h\
|
||||
util/logging.h\
|
||||
util/mutex.h\
|
||||
util/pcre.h\
|
||||
util/random.h\
|
||||
util/sparse_array.h\
|
||||
util/sparse_set.h\
|
||||
util/test.h\
|
||||
util/thread.h\
|
||||
util/utf.h\
|
||||
util/util.h\
|
||||
util/valgrind.h\
|
||||
re2/filtered_re2.h\
|
||||
re2/prefilter.h\
|
||||
re2/prefilter_tree.h\
|
||||
re2/prog.h\
|
||||
re2/re2.h\
|
||||
re2/regexp.h\
|
||||
re2/set.h\
|
||||
re2/stringpiece.h\
|
||||
re2/testing/exhaustive_tester.h\
|
||||
re2/testing/regexp_generator.h\
|
||||
re2/testing/string_generator.h\
|
||||
re2/testing/tester.h\
|
||||
re2/unicode_casefold.h\
|
||||
re2/unicode_groups.h\
|
||||
re2/variadic_function.h\
|
||||
re2/walker-inl.h\
|
||||
|
||||
OFILES=\
|
||||
obj/util/hash.o\
|
||||
obj/util/logging.o\
|
||||
obj/util/rune.o\
|
||||
obj/util/stringprintf.o\
|
||||
obj/util/strutil.o\
|
||||
obj/util/valgrind.o\
|
||||
obj/re2/bitstate.o\
|
||||
obj/re2/compile.o\
|
||||
obj/re2/dfa.o\
|
||||
obj/re2/filtered_re2.o\
|
||||
obj/re2/mimics_pcre.o\
|
||||
obj/re2/nfa.o\
|
||||
obj/re2/onepass.o\
|
||||
obj/re2/parse.o\
|
||||
obj/re2/perl_groups.o\
|
||||
obj/re2/prefilter.o\
|
||||
obj/re2/prefilter_tree.o\
|
||||
obj/re2/prog.o\
|
||||
obj/re2/re2.o\
|
||||
obj/re2/regexp.o\
|
||||
obj/re2/set.o\
|
||||
obj/re2/simplify.o\
|
||||
obj/re2/stringpiece.o\
|
||||
obj/re2/tostring.o\
|
||||
obj/re2/unicode_casefold.o\
|
||||
obj/re2/unicode_groups.o\
|
||||
|
||||
TESTOFILES=\
|
||||
obj/util/pcre.o\
|
||||
obj/util/random.o\
|
||||
obj/util/thread.o\
|
||||
obj/re2/testing/backtrack.o\
|
||||
obj/re2/testing/dump.o\
|
||||
obj/re2/testing/exhaustive_tester.o\
|
||||
obj/re2/testing/null_walker.o\
|
||||
obj/re2/testing/regexp_generator.o\
|
||||
obj/re2/testing/string_generator.o\
|
||||
obj/re2/testing/tester.o\
|
||||
|
||||
TESTS=\
|
||||
obj/test/charclass_test\
|
||||
obj/test/compile_test\
|
||||
obj/test/filtered_re2_test\
|
||||
obj/test/mimics_pcre_test\
|
||||
obj/test/parse_test\
|
||||
obj/test/possible_match_test\
|
||||
obj/test/re2_test\
|
||||
obj/test/re2_arg_test\
|
||||
obj/test/regexp_test\
|
||||
obj/test/required_prefix_test\
|
||||
obj/test/search_test\
|
||||
obj/test/set_test\
|
||||
obj/test/simplify_test\
|
||||
obj/test/string_generator_test\
|
||||
|
||||
BIGTESTS=\
|
||||
obj/test/dfa_test\
|
||||
obj/test/exhaustive1_test\
|
||||
obj/test/exhaustive2_test\
|
||||
obj/test/exhaustive3_test\
|
||||
obj/test/exhaustive_test\
|
||||
obj/test/random_test\
|
||||
|
||||
SOFILES=$(patsubst obj/%,obj/so/%,$(OFILES))
|
||||
STESTOFILES=$(patsubst obj/%,obj/so/%,$(TESTOFILES))
|
||||
STESTS=$(patsubst obj/%,obj/so/%,$(TESTS))
|
||||
SBIGTESTS=$(patsubst obj/%,obj/so/%,$(BIGTESTS))
|
||||
|
||||
DOFILES=$(patsubst obj/%,obj/dbg/%,$(OFILES))
|
||||
DTESTOFILES=$(patsubst obj/%,obj/dbg/%,$(TESTOFILES))
|
||||
DTESTS=$(patsubst obj/%,obj/dbg/%,$(TESTS))
|
||||
DBIGTESTS=$(patsubst obj/%,obj/dbg/%,$(BIGTESTS))
|
||||
|
||||
obj/%.o: %.cc $(HFILES)
|
||||
@mkdir -p $$(dirname $@)
|
||||
$(CXX) -c -o $@ $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc
|
||||
|
||||
obj/dbg/%.o: %.cc $(HFILES)
|
||||
@mkdir -p $$(dirname $@)
|
||||
$(CXX) -c -o $@ $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) $*.cc
|
||||
|
||||
obj/so/%.o: %.cc $(HFILES)
|
||||
@mkdir -p $$(dirname $@)
|
||||
$(CXX) -c -o $@ -fPIC $(CPPFLAGS) $(CXXFLAGS) $(RE2_CXXFLAGS) -DNDEBUG $*.cc
|
||||
|
||||
obj/libre2.a: $(OFILES)
|
||||
@mkdir -p obj
|
||||
$(AR) $(ARFLAGS) obj/libre2.a $(OFILES)
|
||||
|
||||
obj/dbg/libre2.a: $(DOFILES)
|
||||
@mkdir -p obj/dbg
|
||||
$(AR) $(ARFLAGS) obj/dbg/libre2.a $(DOFILES)
|
||||
|
||||
obj/so/libre2.$(SOEXT): $(SOFILES)
|
||||
@mkdir -p obj/so
|
||||
$(MAKE_SHARED_LIBRARY) -o obj/so/libre2.$(SOEXTVER) $(SOFILES)
|
||||
ln -sf libre2.$(SOEXTVER) $@
|
||||
|
||||
obj/test/%: obj/libre2.a obj/re2/testing/%.o $(TESTOFILES) obj/util/test.o
|
||||
@mkdir -p obj/test
|
||||
$(CXX) -o $@ obj/re2/testing/$*.o $(TESTOFILES) obj/util/test.o obj/libre2.a $(LDFLAGS) $(LDPCRE)
|
||||
|
||||
obj/dbg/test/%: obj/dbg/libre2.a obj/dbg/re2/testing/%.o $(DTESTOFILES) obj/dbg/util/test.o
|
||||
@mkdir -p obj/dbg/test
|
||||
$(CXX) -o $@ obj/dbg/re2/testing/$*.o $(DTESTOFILES) obj/dbg/util/test.o obj/dbg/libre2.a $(LDFLAGS) $(LDPCRE)
|
||||
|
||||
obj/so/test/%: obj/so/libre2.$(SOEXT) obj/libre2.a obj/so/re2/testing/%.o $(STESTOFILES) obj/so/util/test.o
|
||||
@mkdir -p obj/so/test
|
||||
$(CXX) -o $@ obj/so/re2/testing/$*.o $(STESTOFILES) obj/so/util/test.o -Lobj/so -lre2 obj/libre2.a $(LDFLAGS) $(LDPCRE)
|
||||
|
||||
obj/test/regexp_benchmark: obj/libre2.a obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o
|
||||
@mkdir -p obj/test
|
||||
$(CXX) -o $@ obj/re2/testing/regexp_benchmark.o $(TESTOFILES) obj/util/benchmark.o obj/libre2.a $(LDFLAGS) $(LDPCRE)
|
||||
|
||||
ifdef REBUILD_TABLES
|
||||
re2/perl_groups.cc: re2/make_perl_groups.pl
|
||||
perl $< > $@
|
||||
|
||||
re2/unicode_%.cc: re2/make_unicode_%.py
|
||||
python $< > $@
|
||||
|
||||
.PRECIOUS: re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc
|
||||
endif
|
||||
|
||||
distclean: clean
|
||||
rm -f re2/perl_groups.cc re2/unicode_casefold.cc re2/unicode_groups.cc
|
||||
|
||||
clean:
|
||||
rm -rf obj
|
||||
rm -f re2/*.pyc
|
||||
|
||||
testofiles: $(TESTOFILES)
|
||||
|
||||
test: $(DTESTS) $(TESTS) $(STESTS) debug-test static-test shared-test
|
||||
|
||||
debug-test: $(DTESTS)
|
||||
@echo
|
||||
@echo Running debug binary tests.
|
||||
@echo
|
||||
@./runtests $(DTESTS)
|
||||
|
||||
static-test: $(TESTS)
|
||||
@echo
|
||||
@echo Running static binary tests.
|
||||
@echo
|
||||
@./runtests $(TESTS)
|
||||
|
||||
shared-test: $(STESTS)
|
||||
@echo
|
||||
@echo Running dynamic binary tests.
|
||||
@echo
|
||||
@LD_LIBRARY_PATH=obj/so:$(LD_LIBRARY_PATH) ./runtests $(STESTS)
|
||||
|
||||
debug-bigtest: $(DTESTS) $(DBIGTESTS)
|
||||
@./runtests $(DTESTS) $(DBIGTESTS)
|
||||
|
||||
static-bigtest: $(TESTS) $(BIGTESTS)
|
||||
@./runtests $(TESTS) $(BIGTESTS)
|
||||
|
||||
shared-bigtest: $(STESTS) $(SBIGTESTS)
|
||||
@LD_LIBRARY_PATH=obj/so:$(LD_LIBRARY_PATH) ./runtests $(STESTS) $(SBIGTESTS)
|
||||
|
||||
benchmark: obj/test/regexp_benchmark
|
||||
|
||||
install: obj/libre2.a obj/so/libre2.$(SOEXT)
|
||||
mkdir -p $(DESTDIR)$(includedir)/re2 $(DESTDIR)$(libdir)/pkgconfig
|
||||
$(INSTALL_DATA) $(INSTALL_HFILES) $(DESTDIR)$(includedir)/re2
|
||||
$(INSTALL) obj/libre2.a $(DESTDIR)$(libdir)/libre2.a
|
||||
$(INSTALL) obj/so/libre2.$(SOEXT) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER00)
|
||||
ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXTVER)
|
||||
ln -sf libre2.$(SOEXTVER00) $(DESTDIR)$(libdir)/libre2.$(SOEXT)
|
||||
sed -e "s#@prefix@#${prefix}#" re2.pc >$(DESTDIR)$(libdir)/pkgconfig/re2.pc
|
||||
|
||||
testinstall:
|
||||
@mkdir -p obj
|
||||
cp testinstall.cc obj
|
||||
ifneq ($(shell uname),Darwin)
|
||||
(cd obj && $(CXX) -I$(DESTDIR)$(includedir) -L$(DESTDIR)$(libdir) testinstall.cc -lre2 -pthread -static -o testinstall)
|
||||
obj/testinstall
|
||||
endif
|
||||
(cd obj && $(CXX) -I$(DESTDIR)$(includedir) -L$(DESTDIR)$(libdir) testinstall.cc -lre2 -pthread -o testinstall)
|
||||
LD_LIBRARY_PATH=$(DESTDIR)$(libdir) obj/testinstall
|
||||
|
||||
benchlog: obj/test/regexp_benchmark
|
||||
(echo '==BENCHMARK==' `hostname` `date`; \
|
||||
(uname -a; $(CXX) --version; git rev-parse --short HEAD; file obj/test/regexp_benchmark) | sed 's/^/# /'; \
|
||||
echo; \
|
||||
./obj/test/regexp_benchmark 'PCRE|RE2') | tee -a benchlog.$$(hostname | sed 's/\..*//')
|
||||
|
||||
# Keep gmake from deleting intermediate files it creates.
|
||||
# This makes repeated builds faster and preserves debug info on OS X.
|
||||
|
||||
.PRECIOUS: obj/%.o obj/dbg/%.o obj/so/%.o obj/libre2.a \
|
||||
obj/dbg/libre2.a obj/so/libre2.a \
|
||||
obj/test/% obj/so/test/% obj/dbg/test/%
|
||||
|
||||
log:
|
||||
$(MAKE) clean
|
||||
$(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" \
|
||||
$(filter obj/test/exhaustive%_test,$(BIGTESTS))
|
||||
echo '#' RE2 exhaustive tests built by make log >re2-exhaustive.txt
|
||||
echo '#' $$(date) >>re2-exhaustive.txt
|
||||
obj/test/exhaustive_test |grep -v '^PASS$$' >>re2-exhaustive.txt
|
||||
obj/test/exhaustive1_test |grep -v '^PASS$$' >>re2-exhaustive.txt
|
||||
obj/test/exhaustive2_test |grep -v '^PASS$$' >>re2-exhaustive.txt
|
||||
obj/test/exhaustive3_test |grep -v '^PASS$$' >>re2-exhaustive.txt
|
||||
|
||||
$(MAKE) CXXFLAGS="$(CXXFLAGS) -DLOGGING=1" obj/test/search_test
|
||||
echo '#' RE2 basic search tests built by make $@ >re2-search.txt
|
||||
echo '#' $$(date) >>re2-search.txt
|
||||
obj/test/search_test |grep -v '^PASS$$' >>re2-search.txt
|
||||
|
||||
x: x.cc obj/libre2.a
|
||||
g++ -I. -o x x.cc obj/libre2.a
|
32
third_party/re2/README
vendored
32
third_party/re2/README
vendored
@ -1,32 +0,0 @@
|
||||
This is the source code repository for RE2, a regular expression library.
|
||||
|
||||
For documentation about how to install and use RE2,
|
||||
visit https://github.com/google/re2/.
|
||||
|
||||
The short version is:
|
||||
|
||||
make
|
||||
make test
|
||||
make install
|
||||
make testinstall
|
||||
|
||||
More information can be found on the wiki:
|
||||
https://github.com/google/re2/wiki
|
||||
|
||||
Issue tracker:
|
||||
https://github.com/google/re2/issues
|
||||
|
||||
Mailing list:
|
||||
https://groups.google.com/group/re2-dev
|
||||
|
||||
Unless otherwise noted, the RE2 source files are distributed
|
||||
under the BSD-style license found in the LICENSE file.
|
||||
|
||||
RE2's native language is C++.
|
||||
An Erlang wrapper is at https://github.com/tuncer/re2/.
|
||||
An Inferno wrapper is at https://github.com/powerman/inferno-re2/.
|
||||
A Node.js wrapper is at https://github.com/uhop/node-re2/ and on NPM.
|
||||
An OCaml wrapper is at https://github.com/janestreet/re2/ and on OPAM.
|
||||
A Perl wrapper is at https://github.com/dgl/re-engine-RE2/ and on CPAN.
|
||||
A Python wrapper is at https://github.com/facebook/pyre2/.
|
||||
A Ruby wrapper is at https://github.com/axic/rre2/.
|
5
third_party/re2/WORKSPACE
vendored
5
third_party/re2/WORKSPACE
vendored
@ -1,5 +0,0 @@
|
||||
# Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
# Bazel (http://bazel.io/) WORKSPACE file for RE2.
|
2211
third_party/re2/benchlog/benchlog.c2
vendored
2211
third_party/re2/benchlog/benchlog.c2
vendored
File diff suppressed because it is too large
Load Diff
582
third_party/re2/benchlog/benchlog.mini
vendored
582
third_party/re2/benchlog/benchlog.mini
vendored
@ -1,582 +0,0 @@
|
||||
hw.ncpu: 2
|
||||
hw.byteorder: 1234
|
||||
hw.memsize: 4294967296
|
||||
hw.activecpu: 2
|
||||
hw.physicalcpu: 2
|
||||
hw.physicalcpu_max: 2
|
||||
hw.logicalcpu: 2
|
||||
hw.logicalcpu_max: 2
|
||||
hw.cputype: 7
|
||||
hw.cpusubtype: 4
|
||||
hw.cpu64bit_capable: 1
|
||||
hw.cpufamily: 1114597871
|
||||
hw.cacheconfig: 2 1 2 0 0 0 0 0 0 0
|
||||
hw.cachesize: 3221225472 32768 2097152 0 0 0 0 0 0 0
|
||||
hw.pagesize: 4096
|
||||
hw.busfrequency: 664000000
|
||||
hw.busfrequency_min: 664000000
|
||||
hw.busfrequency_max: 664000000
|
||||
hw.cpufrequency: 1830000000
|
||||
hw.cpufrequency_min: 1830000000
|
||||
hw.cpufrequency_max: 1830000000
|
||||
hw.cachelinesize: 64
|
||||
hw.l1icachesize: 32768
|
||||
hw.l1dcachesize: 32768
|
||||
hw.l2cachesize: 2097152
|
||||
hw.tbfrequency: 1000000000
|
||||
hw.packages: 1
|
||||
hw.optional.floatingpoint: 1
|
||||
hw.optional.mmx: 1
|
||||
hw.optional.sse: 1
|
||||
hw.optional.sse2: 1
|
||||
hw.optional.sse3: 1
|
||||
hw.optional.supplementalsse3: 1
|
||||
hw.optional.sse4_1: 0
|
||||
hw.optional.sse4_2: 0
|
||||
hw.optional.x86_64: 1
|
||||
hw.machine = i386
|
||||
hw.model = Macmini2,1
|
||||
hw.ncpu = 2
|
||||
hw.byteorder = 1234
|
||||
hw.physmem = 2147483648
|
||||
hw.usermem = 1849147392
|
||||
hw.pagesize = 4096
|
||||
hw.epoch = 0
|
||||
hw.vectorunit = 1
|
||||
hw.busfrequency = 664000000
|
||||
hw.cpufrequency = 1830000000
|
||||
hw.cachelinesize = 64
|
||||
hw.l1icachesize = 32768
|
||||
hw.l1dcachesize = 32768
|
||||
hw.l2settings = 1
|
||||
hw.l2cachesize = 2097152
|
||||
hw.tbfrequency = 1000000000
|
||||
hw.memsize = 4294967296
|
||||
hw.availcpu = 2
|
||||
|
||||
machdep.cpu.max_basic: 10
|
||||
machdep.cpu.max_ext: 2147483656
|
||||
machdep.cpu.vendor: GenuineIntel
|
||||
machdep.cpu.brand_string: Intel(R) Core(TM)2 CPU T5600 @ 1.83GHz
|
||||
machdep.cpu.family: 6
|
||||
machdep.cpu.model: 15
|
||||
machdep.cpu.extmodel: 0
|
||||
machdep.cpu.extfamily: 0
|
||||
machdep.cpu.stepping: 2
|
||||
machdep.cpu.feature_bits: 3219913727 58301
|
||||
machdep.cpu.extfeature_bits: 537921536 1
|
||||
machdep.cpu.signature: 1778
|
||||
machdep.cpu.brand: 0
|
||||
machdep.cpu.features: FPU VME DE PSE TSC MSR PAE MCE CX8 APIC SEP MTRR PGE MCA CMOV PAT PSE36 CLFSH DS ACPI MMX FXSR SSE SSE2 SS HTT TM SSE3 MON DSCPL VMX EST TM2 SSSE3 CX16 TPR PDCM
|
||||
machdep.cpu.extfeatures: SYSCALL XD EM64T
|
||||
machdep.cpu.logical_per_package: 2
|
||||
machdep.cpu.cores_per_package: 2
|
||||
machdep.cpu.microcode_version: 87
|
||||
machdep.cpu.mwait.linesize_min: 64
|
||||
machdep.cpu.mwait.linesize_max: 64
|
||||
machdep.cpu.mwait.extensions: 3
|
||||
machdep.cpu.mwait.sub_Cstates: 139808
|
||||
machdep.cpu.thermal.sensor: 1
|
||||
machdep.cpu.thermal.dynamic_acceleration: 0
|
||||
machdep.cpu.thermal.thresholds: 2
|
||||
machdep.cpu.thermal.ACNT_MCNT: 1
|
||||
machdep.cpu.arch_perf.version: 2
|
||||
machdep.cpu.arch_perf.number: 2
|
||||
machdep.cpu.arch_perf.width: 40
|
||||
machdep.cpu.arch_perf.events_number: 7
|
||||
machdep.cpu.arch_perf.events: 0
|
||||
machdep.cpu.arch_perf.fixed_number: 0
|
||||
machdep.cpu.arch_perf.fixed_width: 0
|
||||
machdep.cpu.cache.linesize: 64
|
||||
machdep.cpu.cache.L2_associativity: 6
|
||||
machdep.cpu.cache.size: 2048
|
||||
machdep.cpu.tlb.inst.small: 128
|
||||
machdep.cpu.tlb.inst.large: 8
|
||||
machdep.cpu.tlb.data.small: 16
|
||||
machdep.cpu.tlb.data.small_level1: 256
|
||||
machdep.cpu.tlb.data.large: 16
|
||||
machdep.cpu.tlb.data.large_level1: 32
|
||||
machdep.cpu.address_bits.physical: 36
|
||||
machdep.cpu.address_bits.virtual: 48
|
||||
machdep.cpu.core_count: 2
|
||||
machdep.cpu.thread_count: 2
|
||||
|
||||
|
||||
==BENCHMARK== mini.local Fri Feb 26 16:57:10 PST 2010
|
||||
# Darwin mini.local 10.2.0 Darwin Kernel Version 10.2.0: Tue Nov 3 10:37:10 PST 2009; root:xnu-1486.2.11~1/RELEASE_I386 i386
|
||||
# i686-apple-darwin10-g++-4.2.1 (GCC) 4.2.1 (Apple Inc. build 5646) (dot 1)
|
||||
# Copyright (C) 2007 Free Software Foundation, Inc.
|
||||
# This is free software; see the source for copying conditions. There is NO
|
||||
# warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
#
|
||||
# a94585d91e66+ tip
|
||||
# obj/test/regexp_benchmark: Mach-O 64-bit executable x86_64
|
||||
|
||||
Search_Easy0_CachedPCRE/8 10000000 176 ns/op 45.40 MB/s
|
||||
Search_Easy0_CachedPCRE/16 10000000 209 ns/op 76.41 MB/s
|
||||
Search_Easy0_CachedPCRE/32 10000000 269 ns/op 118.53 MB/s
|
||||
Search_Easy0_CachedPCRE/64 5000000 398 ns/op 160.77 MB/s
|
||||
Search_Easy0_CachedPCRE/128 5000000 536 ns/op 238.69 MB/s
|
||||
Search_Easy0_CachedPCRE/256 2000000 897 ns/op 285.22 MB/s
|
||||
Search_Easy0_CachedPCRE/512 1000000 2161 ns/op 236.92 MB/s
|
||||
Search_Easy0_CachedPCRE/1K 500000 4769 ns/op 214.70 MB/s
|
||||
Search_Easy0_CachedPCRE/2K 200000 8031 ns/op 255.00 MB/s
|
||||
Search_Easy0_CachedPCRE/4K 100000 16208 ns/op 252.71 MB/s
|
||||
Search_Easy0_CachedPCRE/8K 50000 32219 ns/op 254.26 MB/s
|
||||
Search_Easy0_CachedPCRE/16K 50000 63347 ns/op 258.64 MB/s
|
||||
Search_Easy0_CachedPCRE/32K 10000 125875 ns/op 260.32 MB/s
|
||||
Search_Easy0_CachedPCRE/64K 10000 247829 ns/op 264.44 MB/s
|
||||
Search_Easy0_CachedPCRE/128K 5000 498699 ns/op 262.83 MB/s
|
||||
Search_Easy0_CachedPCRE/256K 2000 978021 ns/op 268.04 MB/s
|
||||
Search_Easy0_CachedPCRE/512K 1000 1975059 ns/op 265.45 MB/s
|
||||
Search_Easy0_CachedPCRE/1M 500 3994258 ns/op 262.52 MB/s
|
||||
Search_Easy0_CachedPCRE/2M 200 7959640 ns/op 263.47 MB/s
|
||||
Search_Easy0_CachedPCRE/4M 100 15950300 ns/op 262.96 MB/s
|
||||
Search_Easy0_CachedPCRE/8M 50 32435540 ns/op 258.62 MB/s
|
||||
Search_Easy0_CachedPCRE/16M 50 64686180 ns/op 259.36 MB/s
|
||||
Search_Easy0_CachedRE2/8 5000000 535 ns/op 14.95 MB/s
|
||||
Search_Easy0_CachedRE2/16 5000000 557 ns/op 28.70 MB/s
|
||||
Search_Easy0_CachedRE2/32 5000000 595 ns/op 53.75 MB/s
|
||||
Search_Easy0_CachedRE2/64 5000000 643 ns/op 99.50 MB/s
|
||||
Search_Easy0_CachedRE2/128 2000000 759 ns/op 168.64 MB/s
|
||||
Search_Easy0_CachedRE2/256 2000000 972 ns/op 263.30 MB/s
|
||||
Search_Easy0_CachedRE2/512 1000000 1458 ns/op 351.13 MB/s
|
||||
Search_Easy0_CachedRE2/1K 1000000 2544 ns/op 402.51 MB/s
|
||||
Search_Easy0_CachedRE2/2K 500000 4551 ns/op 449.99 MB/s
|
||||
Search_Easy0_CachedRE2/4K 200000 8677 ns/op 472.01 MB/s
|
||||
Search_Easy0_CachedRE2/8K 100000 17188 ns/op 476.59 MB/s
|
||||
Search_Easy0_CachedRE2/16K 50000 33869 ns/op 483.73 MB/s
|
||||
Search_Easy0_CachedRE2/32K 50000 67787 ns/op 483.39 MB/s
|
||||
Search_Easy0_CachedRE2/64K 10000 133362 ns/op 491.41 MB/s
|
||||
Search_Easy0_CachedRE2/128K 10000 266469 ns/op 491.88 MB/s
|
||||
Search_Easy0_CachedRE2/256K 5000 536980 ns/op 488.18 MB/s
|
||||
Search_Easy0_CachedRE2/512K 2000 1050843 ns/op 498.92 MB/s
|
||||
Search_Easy0_CachedRE2/1M 1000 2120649 ns/op 494.46 MB/s
|
||||
Search_Easy0_CachedRE2/2M 500 4273918 ns/op 490.69 MB/s
|
||||
Search_Easy0_CachedRE2/4M 200 8591285 ns/op 488.20 MB/s
|
||||
Search_Easy0_CachedRE2/8M 100 17197390 ns/op 487.78 MB/s
|
||||
Search_Easy0_CachedRE2/16M 50 34338780 ns/op 488.58 MB/s
|
||||
Search_Easy1_CachedPCRE/8 10000000 174 ns/op 45.74 MB/s
|
||||
Search_Easy1_CachedPCRE/16 10000000 206 ns/op 77.31 MB/s
|
||||
Search_Easy1_CachedPCRE/32 10000000 270 ns/op 118.43 MB/s
|
||||
Search_Easy1_CachedPCRE/64 5000000 402 ns/op 159.07 MB/s
|
||||
Search_Easy1_CachedPCRE/128 5000000 540 ns/op 236.84 MB/s
|
||||
Search_Easy1_CachedPCRE/256 2000000 909 ns/op 281.34 MB/s
|
||||
Search_Easy1_CachedPCRE/512 1000000 1852 ns/op 276.34 MB/s
|
||||
Search_Easy1_CachedPCRE/1K 500000 4318 ns/op 237.12 MB/s
|
||||
Search_Easy1_CachedPCRE/2K 200000 8346 ns/op 245.37 MB/s
|
||||
Search_Easy1_CachedPCRE/4K 100000 16214 ns/op 252.62 MB/s
|
||||
Search_Easy1_CachedPCRE/8K 50000 32438 ns/op 252.54 MB/s
|
||||
Search_Easy1_CachedPCRE/16K 50000 62914 ns/op 260.42 MB/s
|
||||
Search_Easy1_CachedPCRE/32K 10000 124792 ns/op 262.58 MB/s
|
||||
Search_Easy1_CachedPCRE/64K 10000 250941 ns/op 261.16 MB/s
|
||||
Search_Easy1_CachedPCRE/128K 5000 498405 ns/op 262.98 MB/s
|
||||
Search_Easy1_CachedPCRE/256K 2000 997305 ns/op 262.85 MB/s
|
||||
Search_Easy1_CachedPCRE/512K 1000 2023179 ns/op 259.14 MB/s
|
||||
Search_Easy1_CachedPCRE/1M 500 4005202 ns/op 261.80 MB/s
|
||||
Search_Easy1_CachedPCRE/2M 200 8116410 ns/op 258.38 MB/s
|
||||
Search_Easy1_CachedPCRE/4M 100 16145970 ns/op 259.77 MB/s
|
||||
Search_Easy1_CachedPCRE/8M 50 32471260 ns/op 258.34 MB/s
|
||||
Search_Easy1_CachedPCRE/16M 50 64734020 ns/op 259.17 MB/s
|
||||
Search_Easy1_CachedRE2/8 5000000 543 ns/op 14.72 MB/s
|
||||
Search_Easy1_CachedRE2/16 5000000 570 ns/op 28.07 MB/s
|
||||
Search_Easy1_CachedRE2/32 5000000 605 ns/op 52.81 MB/s
|
||||
Search_Easy1_CachedRE2/64 5000000 643 ns/op 99.39 MB/s
|
||||
Search_Easy1_CachedRE2/128 2000000 764 ns/op 167.45 MB/s
|
||||
Search_Easy1_CachedRE2/256 2000000 970 ns/op 263.85 MB/s
|
||||
Search_Easy1_CachedRE2/512 1000000 1455 ns/op 351.75 MB/s
|
||||
Search_Easy1_CachedRE2/1K 1000000 2506 ns/op 408.48 MB/s
|
||||
Search_Easy1_CachedRE2/2K 500000 4571 ns/op 447.97 MB/s
|
||||
Search_Easy1_CachedRE2/4K 200000 8812 ns/op 464.81 MB/s
|
||||
Search_Easy1_CachedRE2/8K 100000 17079 ns/op 479.65 MB/s
|
||||
Search_Easy1_CachedRE2/16K 50000 33802 ns/op 484.70 MB/s
|
||||
Search_Easy1_CachedRE2/32K 50000 67171 ns/op 487.83 MB/s
|
||||
Search_Easy1_CachedRE2/64K 10000 131505 ns/op 498.35 MB/s
|
||||
Search_Easy1_CachedRE2/128K 10000 263228 ns/op 497.94 MB/s
|
||||
Search_Easy1_CachedRE2/256K 5000 528135 ns/op 496.36 MB/s
|
||||
Search_Easy1_CachedRE2/512K 2000 1052768 ns/op 498.01 MB/s
|
||||
Search_Easy1_CachedRE2/1M 1000 2112714 ns/op 496.32 MB/s
|
||||
Search_Easy1_CachedRE2/2M 500 4289478 ns/op 488.91 MB/s
|
||||
Search_Easy1_CachedRE2/4M 200 8519430 ns/op 492.32 MB/s
|
||||
Search_Easy1_CachedRE2/8M 100 17002860 ns/op 493.36 MB/s
|
||||
Search_Easy1_CachedRE2/16M 50 34341100 ns/op 488.55 MB/s
|
||||
Search_Medium_CachedPCRE/8 10000000 175 ns/op 45.48 MB/s
|
||||
Search_Medium_CachedPCRE/16 10000000 206 ns/op 77.31 MB/s
|
||||
Search_Medium_CachedPCRE/32 10000000 273 ns/op 117.10 MB/s
|
||||
Search_Medium_CachedPCRE/64 5000000 427 ns/op 149.60 MB/s
|
||||
Search_Medium_CachedPCRE/128 200000 9382 ns/op 13.64 MB/s
|
||||
Search_Medium_CachedPCRE/256 100000 15339 ns/op 16.69 MB/s
|
||||
Search_Medium_CachedPCRE/512 50000 35837 ns/op 14.29 MB/s
|
||||
Search_Medium_CachedPCRE/1K 50000 71109 ns/op 14.40 MB/s
|
||||
Search_Medium_CachedPCRE/2K 10000 111371 ns/op 18.39 MB/s
|
||||
Search_Medium_CachedPCRE/4K 10000 264964 ns/op 15.46 MB/s
|
||||
Search_Medium_CachedPCRE/8K 5000 554964 ns/op 14.76 MB/s
|
||||
Search_Medium_CachedPCRE/16K 2000 1122116 ns/op 14.60 MB/s
|
||||
Search_Medium_CachedPCRE/32K 1000 2305129 ns/op 14.22 MB/s
|
||||
Search_Medium_CachedPCRE/64K 500 4401888 ns/op 14.89 MB/s
|
||||
Search_Medium_CachedPCRE/128K 200 8591800 ns/op 15.26 MB/s
|
||||
Search_Medium_CachedPCRE/256K 100 17534580 ns/op 14.95 MB/s
|
||||
Search_Medium_CachedRE2/8 5000000 531 ns/op 15.06 MB/s
|
||||
Search_Medium_CachedRE2/16 5000000 579 ns/op 27.60 MB/s
|
||||
Search_Medium_CachedRE2/32 5000000 666 ns/op 47.99 MB/s
|
||||
Search_Medium_CachedRE2/64 2000000 817 ns/op 78.31 MB/s
|
||||
Search_Medium_CachedRE2/128 1000000 1174 ns/op 108.94 MB/s
|
||||
Search_Medium_CachedRE2/256 1000000 1824 ns/op 140.30 MB/s
|
||||
Search_Medium_CachedRE2/512 500000 3097 ns/op 165.29 MB/s
|
||||
Search_Medium_CachedRE2/1K 500000 6101 ns/op 167.84 MB/s
|
||||
Search_Medium_CachedRE2/2K 200000 12024 ns/op 170.32 MB/s
|
||||
Search_Medium_CachedRE2/4K 100000 21483 ns/op 190.66 MB/s
|
||||
Search_Medium_CachedRE2/8K 50000 41321 ns/op 198.25 MB/s
|
||||
Search_Medium_CachedRE2/16K 20000 82227 ns/op 199.25 MB/s
|
||||
Search_Medium_CachedRE2/32K 10000 166314 ns/op 197.02 MB/s
|
||||
Search_Medium_CachedRE2/64K 5000 334190 ns/op 196.10 MB/s
|
||||
Search_Medium_CachedRE2/128K 5000 672222 ns/op 194.98 MB/s
|
||||
Search_Medium_CachedRE2/256K 2000 1335691 ns/op 196.26 MB/s
|
||||
Search_Medium_CachedRE2/512K 1000 2650973 ns/op 197.77 MB/s
|
||||
Search_Medium_CachedRE2/1M 500 5401168 ns/op 194.14 MB/s
|
||||
Search_Medium_CachedRE2/2M 100 10724160 ns/op 195.55 MB/s
|
||||
Search_Medium_CachedRE2/4M 100 21647840 ns/op 193.75 MB/s
|
||||
Search_Medium_CachedRE2/8M 50 43369000 ns/op 193.42 MB/s
|
||||
Search_Medium_CachedRE2/16M 20 85095750 ns/op 197.16 MB/s
|
||||
Search_Hard_CachedPCRE/8 10000000 178 ns/op 44.77 MB/s
|
||||
Search_Hard_CachedPCRE/16 10000000 211 ns/op 75.54 MB/s
|
||||
Search_Hard_CachedPCRE/32 10000000 274 ns/op 116.75 MB/s
|
||||
Search_Hard_CachedPCRE/64 5000000 401 ns/op 159.58 MB/s
|
||||
Search_Hard_CachedPCRE/128 5000 331833 ns/op 0.39 MB/s
|
||||
Search_Hard_CachedPCRE/256 2000 1299658 ns/op 0.20 MB/s
|
||||
Search_Hard_CachedPCRE/512 500 5361070 ns/op 0.10 MB/s
|
||||
Search_Hard_CachedPCRE/1K 100 20744900 ns/op 0.05 MB/s
|
||||
Search_Hard_CachedPCRE/2K 20 78382950 ns/op 0.03 MB/s
|
||||
Search_Hard_CachedPCRE/4K 5 335826800 ns/op 0.01 MB/s
|
||||
Search_Hard_CachedRE2/8 5000000 550 ns/op 14.53 MB/s
|
||||
Search_Hard_CachedRE2/16 5000000 600 ns/op 26.66 MB/s
|
||||
Search_Hard_CachedRE2/32 5000000 683 ns/op 46.80 MB/s
|
||||
Search_Hard_CachedRE2/64 2000000 834 ns/op 76.69 MB/s
|
||||
Search_Hard_CachedRE2/128 1000000 1168 ns/op 109.57 MB/s
|
||||
Search_Hard_CachedRE2/256 1000000 1833 ns/op 139.65 MB/s
|
||||
Search_Hard_CachedRE2/512 500000 3069 ns/op 166.81 MB/s
|
||||
Search_Hard_CachedRE2/1K 500000 5780 ns/op 177.14 MB/s
|
||||
Search_Hard_CachedRE2/2K 200000 11060 ns/op 185.17 MB/s
|
||||
Search_Hard_CachedRE2/4K 100000 21511 ns/op 190.41 MB/s
|
||||
Search_Hard_CachedRE2/8K 50000 41962 ns/op 195.22 MB/s
|
||||
Search_Hard_CachedRE2/16K 20000 82460 ns/op 198.69 MB/s
|
||||
Search_Hard_CachedRE2/32K 10000 164209 ns/op 199.55 MB/s
|
||||
Search_Hard_CachedRE2/64K 5000 326354 ns/op 200.81 MB/s
|
||||
Search_Hard_CachedRE2/128K 5000 659142 ns/op 198.85 MB/s
|
||||
Search_Hard_CachedRE2/256K 2000 1333642 ns/op 196.56 MB/s
|
||||
Search_Hard_CachedRE2/512K 1000 2687422 ns/op 195.09 MB/s
|
||||
Search_Hard_CachedRE2/1M 500 5351592 ns/op 195.94 MB/s
|
||||
Search_Hard_CachedRE2/2M 100 10581690 ns/op 198.19 MB/s
|
||||
Search_Hard_CachedRE2/4M 100 21324320 ns/op 196.69 MB/s
|
||||
Search_Hard_CachedRE2/8M 50 41892520 ns/op 200.24 MB/s
|
||||
Search_Hard_CachedRE2/16M 20 85475700 ns/op 196.28 MB/s
|
||||
Search_Parens_CachedPCRE/8 10000000 298 ns/op 26.80 MB/s
|
||||
Search_Parens_CachedRE2/8 5000000 562 ns/op 14.21 MB/s
|
||||
Search_Parens_CachedRE2/16 5000000 598 ns/op 26.71 MB/s
|
||||
Search_Parens_CachedRE2/32 5000000 676 ns/op 47.27 MB/s
|
||||
Search_Parens_CachedRE2/64 2000000 828 ns/op 77.21 MB/s
|
||||
Search_Parens_CachedRE2/128 1000000 1155 ns/op 110.73 MB/s
|
||||
Search_Parens_CachedRE2/256 1000000 1788 ns/op 143.13 MB/s
|
||||
Search_Parens_CachedRE2/512 500000 3064 ns/op 167.09 MB/s
|
||||
Search_Parens_CachedRE2/1K 500000 5698 ns/op 179.69 MB/s
|
||||
Search_Parens_CachedRE2/2K 200000 10961 ns/op 186.84 MB/s
|
||||
Search_Parens_CachedRE2/4K 100000 21527 ns/op 190.27 MB/s
|
||||
Search_Parens_CachedRE2/8K 50000 41923 ns/op 195.40 MB/s
|
||||
Search_Parens_CachedRE2/16K 20000 85505 ns/op 191.61 MB/s
|
||||
Search_Parens_CachedRE2/32K 10000 164437 ns/op 199.27 MB/s
|
||||
Search_Parens_CachedRE2/64K 5000 332654 ns/op 197.01 MB/s
|
||||
Search_Parens_CachedRE2/128K 5000 677745 ns/op 193.39 MB/s
|
||||
Search_Parens_CachedRE2/256K 2000 1331012 ns/op 196.95 MB/s
|
||||
Search_Parens_CachedRE2/512K 1000 2692594 ns/op 194.71 MB/s
|
||||
Search_Parens_CachedRE2/1M 500 5355880 ns/op 195.78 MB/s
|
||||
Search_Parens_CachedRE2/2M 100 10822340 ns/op 193.78 MB/s
|
||||
Search_Parens_CachedRE2/4M 100 21464430 ns/op 195.41 MB/s
|
||||
Search_Parens_CachedRE2/8M 50 42875940 ns/op 195.65 MB/s
|
||||
Search_Parens_CachedRE2/16M 20 84654300 ns/op 198.19 MB/s
|
||||
Search_BigFixed_CachedPCRE/8 5000000 360 ns/op 22.21 MB/s
|
||||
Search_BigFixed_CachedPCRE/16 5000000 442 ns/op 36.15 MB/s
|
||||
Search_BigFixed_CachedPCRE/32 5000000 606 ns/op 52.73 MB/s
|
||||
Search_BigFixed_CachedPCRE/64 2000000 935 ns/op 68.39 MB/s
|
||||
Search_BigFixed_CachedPCRE/128 1000000 1525 ns/op 83.91 MB/s
|
||||
Search_BigFixed_CachedPCRE/256 1000000 2718 ns/op 94.18 MB/s
|
||||
Search_BigFixed_CachedPCRE/512 500000 5020 ns/op 101.98 MB/s
|
||||
Search_BigFixed_CachedPCRE/1K 200000 9761 ns/op 104.90 MB/s
|
||||
Search_BigFixed_CachedPCRE/2K 100000 19275 ns/op 106.25 MB/s
|
||||
Search_BigFixed_CachedPCRE/4K 50000 38488 ns/op 106.42 MB/s
|
||||
Search_BigFixed_CachedPCRE/8K 20000 76229 ns/op 107.46 MB/s
|
||||
Search_BigFixed_CachedPCRE/16K 10000 155350 ns/op 105.46 MB/s
|
||||
Search_BigFixed_CachedPCRE/32K 5000 309242 ns/op 105.96 MB/s
|
||||
Search_BigFixed_CachedRE2/8 10000000 194 ns/op 41.03 MB/s
|
||||
Search_BigFixed_CachedRE2/16 5000000 589 ns/op 27.15 MB/s
|
||||
Search_BigFixed_CachedRE2/32 5000000 655 ns/op 48.83 MB/s
|
||||
Search_BigFixed_CachedRE2/64 5000000 715 ns/op 89.46 MB/s
|
||||
Search_BigFixed_CachedRE2/128 2000000 882 ns/op 145.09 MB/s
|
||||
Search_BigFixed_CachedRE2/256 1000000 1293 ns/op 197.97 MB/s
|
||||
Search_BigFixed_CachedRE2/512 1000000 1924 ns/op 266.06 MB/s
|
||||
Search_BigFixed_CachedRE2/1K 500000 3294 ns/op 310.79 MB/s
|
||||
Search_BigFixed_CachedRE2/2K 500000 6057 ns/op 338.10 MB/s
|
||||
Search_BigFixed_CachedRE2/4K 200000 11475 ns/op 356.93 MB/s
|
||||
Search_BigFixed_CachedRE2/8K 100000 22395 ns/op 365.79 MB/s
|
||||
Search_BigFixed_CachedRE2/16K 50000 44333 ns/op 369.56 MB/s
|
||||
Search_BigFixed_CachedRE2/32K 20000 88061 ns/op 372.11 MB/s
|
||||
Search_BigFixed_CachedRE2/64K 10000 173649 ns/op 377.40 MB/s
|
||||
Search_BigFixed_CachedRE2/128K 5000 347251 ns/op 377.46 MB/s
|
||||
Search_BigFixed_CachedRE2/256K 2000 702561 ns/op 373.13 MB/s
|
||||
Search_BigFixed_CachedRE2/512K 1000 1408041 ns/op 372.35 MB/s
|
||||
Search_BigFixed_CachedRE2/1M 500 3003070 ns/op 349.17 MB/s
|
||||
Search_Success_PCRE/8 500000 3891 ns/op 2.06 MB/s
|
||||
Search_Success_PCRE/16 500000 3865 ns/op 4.14 MB/s
|
||||
Search_Success_PCRE/32 500000 3861 ns/op 8.29 MB/s
|
||||
Search_Success_PCRE/64 500000 3921 ns/op 16.32 MB/s
|
||||
Search_Success_PCRE/128 500000 4677 ns/op 27.37 MB/s
|
||||
Search_Success_PCRE/256 500000 5362 ns/op 47.73 MB/s
|
||||
Search_Success_PCRE/512 500000 7125 ns/op 71.85 MB/s
|
||||
Search_Success_PCRE/1K 200000 10643 ns/op 96.21 MB/s
|
||||
Search_Success_PCRE/2K 100000 17620 ns/op 116.23 MB/s
|
||||
Search_Success_PCRE/4K 50000 31657 ns/op 129.39 MB/s
|
||||
Search_Success_PCRE/8K 50000 59290 ns/op 138.17 MB/s
|
||||
Search_Success_PCRE/16K 10000 115346 ns/op 142.04 MB/s
|
||||
Search_Success_PCRE/32K 10000 225258 ns/op 145.47 MB/s
|
||||
Search_Success_PCRE/64K 5000 452994 ns/op 144.67 MB/s
|
||||
Search_Success_PCRE/128K 2000 904745 ns/op 144.87 MB/s
|
||||
Search_Success_PCRE/256K 1000 1786683 ns/op 146.72 MB/s
|
||||
Search_Success_PCRE/512K 500 3600316 ns/op 145.62 MB/s
|
||||
Search_Success_PCRE/1M 200 7413055 ns/op 141.45 MB/s
|
||||
Search_Success_PCRE/2M 100 15261930 ns/op 137.41 MB/s
|
||||
Search_Success_PCRE/4M 50 32827960 ns/op 127.77 MB/s
|
||||
Search_Success_PCRE/8M 20 73886450 ns/op 113.53 MB/s
|
||||
Search_Success_PCRE/16M 5 247881200 ns/op 67.68 MB/s
|
||||
Search_Success_RE2/8 100000 18948 ns/op 0.42 MB/s
|
||||
Search_Success_RE2/16 50000 40076 ns/op 0.40 MB/s
|
||||
Search_Success_RE2/32 50000 40543 ns/op 0.79 MB/s
|
||||
Search_Success_RE2/64 50000 40520 ns/op 1.58 MB/s
|
||||
Search_Success_RE2/128 50000 41222 ns/op 3.11 MB/s
|
||||
Search_Success_RE2/256 50000 41361 ns/op 6.19 MB/s
|
||||
Search_Success_RE2/512 50000 42418 ns/op 12.07 MB/s
|
||||
Search_Success_RE2/1K 50000 45239 ns/op 22.64 MB/s
|
||||
Search_Success_RE2/2K 50000 50568 ns/op 40.50 MB/s
|
||||
Search_Success_RE2/4K 50000 60722 ns/op 67.45 MB/s
|
||||
Search_Success_RE2/8K 20000 82046 ns/op 99.85 MB/s
|
||||
Search_Success_RE2/16K 10000 125412 ns/op 130.64 MB/s
|
||||
Search_Success_RE2/32K 10000 211805 ns/op 154.71 MB/s
|
||||
Search_Success_RE2/64K 5000 373132 ns/op 175.64 MB/s
|
||||
Search_Success_RE2/128K 2000 710166 ns/op 184.57 MB/s
|
||||
Search_Success_RE2/256K 2000 1392231 ns/op 188.29 MB/s
|
||||
Search_Success_RE2/512K 1000 2763051 ns/op 189.75 MB/s
|
||||
Search_Success_RE2/1M 500 5547628 ns/op 189.01 MB/s
|
||||
Search_Success_RE2/2M 100 11709090 ns/op 179.10 MB/s
|
||||
Search_Success_RE2/4M 50 25220160 ns/op 166.31 MB/s
|
||||
Search_Success_RE2/8M 20 59411600 ns/op 141.19 MB/s
|
||||
Search_Success_RE2/16M 5 219468600 ns/op 76.44 MB/s
|
||||
Search_Success_CachedPCRE/8 5000000 328 ns/op 24.35 MB/s
|
||||
Search_Success_CachedPCRE/16 5000000 389 ns/op 41.06 MB/s
|
||||
Search_Success_CachedPCRE/32 5000000 507 ns/op 63.11 MB/s
|
||||
Search_Success_CachedPCRE/64 2000000 754 ns/op 84.80 MB/s
|
||||
Search_Success_CachedPCRE/128 1000000 1164 ns/op 109.89 MB/s
|
||||
Search_Success_CachedPCRE/256 1000000 2051 ns/op 124.81 MB/s
|
||||
Search_Success_CachedPCRE/512 500000 3831 ns/op 133.64 MB/s
|
||||
Search_Success_CachedPCRE/1K 500000 7280 ns/op 140.66 MB/s
|
||||
Search_Success_CachedPCRE/2K 200000 14254 ns/op 143.67 MB/s
|
||||
Search_Success_CachedPCRE/4K 100000 28223 ns/op 145.13 MB/s
|
||||
Search_Success_CachedPCRE/8K 50000 55445 ns/op 147.75 MB/s
|
||||
Search_Success_CachedPCRE/16K 10000 112739 ns/op 145.33 MB/s
|
||||
Search_Success_CachedPCRE/32K 10000 219943 ns/op 148.98 MB/s
|
||||
Search_Success_CachedPCRE/64K 5000 440884 ns/op 148.65 MB/s
|
||||
Search_Success_CachedPCRE/128K 2000 898950 ns/op 145.81 MB/s
|
||||
Search_Success_CachedPCRE/256K 1000 1775905 ns/op 147.61 MB/s
|
||||
Search_Success_CachedPCRE/512K 500 3579178 ns/op 146.48 MB/s
|
||||
Search_Success_CachedPCRE/1M 200 7278075 ns/op 144.07 MB/s
|
||||
Search_Success_CachedPCRE/2M 100 14954670 ns/op 140.23 MB/s
|
||||
Search_Success_CachedPCRE/4M 50 31865060 ns/op 131.63 MB/s
|
||||
Search_Success_CachedPCRE/8M 20 73977900 ns/op 113.39 MB/s
|
||||
Search_Success_CachedPCRE/16M 5 250587400 ns/op 66.95 MB/s
|
||||
Search_Success_CachedRE2/8 10000000 206 ns/op 38.80 MB/s
|
||||
Search_Success_CachedRE2/16 5000000 598 ns/op 26.75 MB/s
|
||||
Search_Success_CachedRE2/32 5000000 675 ns/op 47.39 MB/s
|
||||
Search_Success_CachedRE2/64 2000000 847 ns/op 75.52 MB/s
|
||||
Search_Success_CachedRE2/128 1000000 1211 ns/op 105.65 MB/s
|
||||
Search_Success_CachedRE2/256 1000000 1886 ns/op 135.73 MB/s
|
||||
Search_Success_CachedRE2/512 500000 3123 ns/op 163.90 MB/s
|
||||
Search_Success_CachedRE2/1K 500000 5754 ns/op 177.94 MB/s
|
||||
Search_Success_CachedRE2/2K 200000 10929 ns/op 187.38 MB/s
|
||||
Search_Success_CachedRE2/4K 100000 20887 ns/op 196.10 MB/s
|
||||
Search_Success_CachedRE2/8K 50000 41295 ns/op 198.37 MB/s
|
||||
Search_Success_CachedRE2/16K 20000 82338 ns/op 198.98 MB/s
|
||||
Search_Success_CachedRE2/32K 10000 168893 ns/op 194.02 MB/s
|
||||
Search_Success_CachedRE2/64K 5000 337449 ns/op 194.21 MB/s
|
||||
Search_Success_CachedRE2/128K 5000 670247 ns/op 195.56 MB/s
|
||||
Search_Success_CachedRE2/256K 2000 1342666 ns/op 195.24 MB/s
|
||||
Search_Success_CachedRE2/512K 1000 2711677 ns/op 193.34 MB/s
|
||||
Search_Success_CachedRE2/1M 500 5403052 ns/op 194.07 MB/s
|
||||
Search_Success_CachedRE2/2M 100 11697250 ns/op 179.29 MB/s
|
||||
Search_Success_CachedRE2/4M 50 24796680 ns/op 169.15 MB/s
|
||||
Search_Success_CachedRE2/8M 20 59587450 ns/op 140.78 MB/s
|
||||
Search_Success_CachedRE2/16M 5 225415400 ns/op 74.43 MB/s
|
||||
Search_Success1_PCRE/8 500000 4063 ns/op 1.97 MB/s
|
||||
Search_Success1_PCRE/16 500000 4104 ns/op 3.90 MB/s
|
||||
Search_Success1_PCRE/32 500000 4162 ns/op 7.69 MB/s
|
||||
Search_Success1_PCRE/64 500000 4284 ns/op 14.94 MB/s
|
||||
Search_Success1_PCRE/128 500000 4857 ns/op 26.35 MB/s
|
||||
Search_Success1_PCRE/256 500000 5507 ns/op 46.48 MB/s
|
||||
Search_Success1_PCRE/512 500000 7203 ns/op 71.08 MB/s
|
||||
Search_Success1_PCRE/1K 200000 10470 ns/op 97.80 MB/s
|
||||
Search_Success1_PCRE/2K 100000 17455 ns/op 117.33 MB/s
|
||||
Search_Success1_PCRE/4K 50000 31564 ns/op 129.77 MB/s
|
||||
Search_Success1_PCRE/8K 50000 59112 ns/op 138.58 MB/s
|
||||
Search_Success1_PCRE/16K 10000 115903 ns/op 141.36 MB/s
|
||||
Search_Success1_PCRE/32K 10000 223311 ns/op 146.74 MB/s
|
||||
Search_Success1_PCRE/64K 5000 447509 ns/op 146.45 MB/s
|
||||
Search_Success1_PCRE/128K 2000 874543 ns/op 149.87 MB/s
|
||||
Search_Success1_PCRE/256K 1000 1836342 ns/op 142.75 MB/s
|
||||
Search_Success1_PCRE/512K 500 3636250 ns/op 144.18 MB/s
|
||||
Search_Success1_PCRE/1M 200 7256345 ns/op 144.50 MB/s
|
||||
Search_Success1_PCRE/2M 100 15093450 ns/op 138.94 MB/s
|
||||
Search_Success1_PCRE/4M 50 32167920 ns/op 130.39 MB/s
|
||||
Search_Success1_PCRE/8M 20 74735800 ns/op 112.24 MB/s
|
||||
Search_Success1_PCRE/16M 5 252818600 ns/op 66.36 MB/s
|
||||
Search_Success1_RE2/8 50000 51778 ns/op 0.15 MB/s
|
||||
Search_Success1_RE2/16 50000 50754 ns/op 0.32 MB/s
|
||||
Search_Success1_RE2/32 50000 51127 ns/op 0.63 MB/s
|
||||
Search_Success1_RE2/64 50000 51305 ns/op 1.25 MB/s
|
||||
Search_Success1_RE2/128 50000 51580 ns/op 2.48 MB/s
|
||||
Search_Success1_RE2/256 50000 52019 ns/op 4.92 MB/s
|
||||
Search_Success1_RE2/512 50000 53145 ns/op 9.63 MB/s
|
||||
Search_Success1_RE2/1K 50000 55871 ns/op 18.33 MB/s
|
||||
Search_Success1_RE2/2K 50000 61477 ns/op 33.31 MB/s
|
||||
Search_Success1_RE2/4K 50000 71875 ns/op 56.99 MB/s
|
||||
Search_Success1_RE2/8K 20000 94822 ns/op 86.39 MB/s
|
||||
Search_Success1_RE2/16K 10000 137021 ns/op 119.57 MB/s
|
||||
Search_Success1_RE2/32K 10000 220596 ns/op 148.54 MB/s
|
||||
Search_Success1_RE2/64K 5000 377808 ns/op 173.46 MB/s
|
||||
Search_Success1_RE2/128K 5000 707546 ns/op 185.25 MB/s
|
||||
Search_Success1_RE2/256K 2000 1367308 ns/op 191.72 MB/s
|
||||
Search_Success1_RE2/512K 1000 2729291 ns/op 192.10 MB/s
|
||||
Search_Success1_RE2/1M 500 5439634 ns/op 192.77 MB/s
|
||||
Search_Success1_RE2/2M 100 11626860 ns/op 180.37 MB/s
|
||||
Search_Success1_RE2/4M 50 24603160 ns/op 170.48 MB/s
|
||||
Search_Success1_RE2/8M 20 59001300 ns/op 142.18 MB/s
|
||||
Search_Success1_RE2/16M 5 219520200 ns/op 76.43 MB/s
|
||||
Search_Success1_Cached_PCRE/8 5000000 373 ns/op 21.41 MB/s
|
||||
Search_Success1_Cached_PCRE/16 5000000 437 ns/op 36.61 MB/s
|
||||
Search_Success1_Cached_PCRE/32 5000000 543 ns/op 58.84 MB/s
|
||||
Search_Success1_Cached_PCRE/64 2000000 784 ns/op 81.60 MB/s
|
||||
Search_Success1_Cached_PCRE/128 1000000 1193 ns/op 107.29 MB/s
|
||||
Search_Success1_Cached_PCRE/256 1000000 2044 ns/op 125.23 MB/s
|
||||
Search_Success1_Cached_PCRE/512 500000 3734 ns/op 137.10 MB/s
|
||||
Search_Success1_Cached_PCRE/1K 500000 7121 ns/op 143.78 MB/s
|
||||
Search_Success1_Cached_PCRE/2K 200000 13767 ns/op 148.76 MB/s
|
||||
Search_Success1_Cached_PCRE/4K 100000 27176 ns/op 150.72 MB/s
|
||||
Search_Success1_Cached_PCRE/8K 50000 54155 ns/op 151.27 MB/s
|
||||
Search_Success1_Cached_PCRE/16K 10000 109309 ns/op 149.89 MB/s
|
||||
Search_Success1_Cached_PCRE/32K 10000 215890 ns/op 151.78 MB/s
|
||||
Search_Success1_Cached_PCRE/64K 5000 432550 ns/op 151.51 MB/s
|
||||
Search_Success1_Cached_PCRE/128K 2000 870568 ns/op 150.56 MB/s
|
||||
Search_Success1_Cached_PCRE/256K 1000 1756215 ns/op 149.27 MB/s
|
||||
Search_Success1_Cached_PCRE/512K 500 3671994 ns/op 142.78 MB/s
|
||||
Search_Success1_Cached_PCRE/1M 200 7134810 ns/op 146.97 MB/s
|
||||
Search_Success1_Cached_PCRE/2M 100 14672580 ns/op 142.93 MB/s
|
||||
Search_Success1_Cached_PCRE/4M 50 31146040 ns/op 134.67 MB/s
|
||||
Search_Success1_Cached_PCRE/8M 20 72224500 ns/op 116.15 MB/s
|
||||
Search_Success1_Cached_PCRE/16M 5 243683800 ns/op 68.85 MB/s
|
||||
Search_Success1_Cached_RE2/8 5000000 544 ns/op 14.69 MB/s
|
||||
Search_Success1_Cached_RE2/16 5000000 583 ns/op 27.43 MB/s
|
||||
Search_Success1_Cached_RE2/32 5000000 661 ns/op 48.37 MB/s
|
||||
Search_Success1_Cached_RE2/64 2000000 818 ns/op 78.23 MB/s
|
||||
Search_Success1_Cached_RE2/128 1000000 1148 ns/op 111.40 MB/s
|
||||
Search_Success1_Cached_RE2/256 1000000 1778 ns/op 143.95 MB/s
|
||||
Search_Success1_Cached_RE2/512 500000 3036 ns/op 168.64 MB/s
|
||||
Search_Success1_Cached_RE2/1K 500000 5549 ns/op 184.53 MB/s
|
||||
Search_Success1_Cached_RE2/2K 200000 10580 ns/op 193.56 MB/s
|
||||
Search_Success1_Cached_RE2/4K 100000 20645 ns/op 198.39 MB/s
|
||||
Search_Success1_Cached_RE2/8K 50000 40775 ns/op 200.90 MB/s
|
||||
Search_Success1_Cached_RE2/16K 20000 81030 ns/op 202.20 MB/s
|
||||
Search_Success1_Cached_RE2/32K 10000 162338 ns/op 201.85 MB/s
|
||||
Search_Success1_Cached_RE2/64K 5000 324387 ns/op 202.03 MB/s
|
||||
Search_Success1_Cached_RE2/128K 5000 648468 ns/op 202.13 MB/s
|
||||
Search_Success1_Cached_RE2/256K 2000 1299439 ns/op 201.74 MB/s
|
||||
Search_Success1_Cached_RE2/512K 1000 2608958 ns/op 200.96 MB/s
|
||||
Search_Success1_Cached_RE2/1M 500 5263964 ns/op 199.20 MB/s
|
||||
Search_Success1_Cached_RE2/2M 200 10793175 ns/op 194.30 MB/s
|
||||
Search_Success1_Cached_RE2/4M 50 24138120 ns/op 173.76 MB/s
|
||||
Search_Success1_Cached_RE2/8M 20 58223300 ns/op 144.08 MB/s
|
||||
Search_Success1_Cached_RE2/16M 5 215741400 ns/op 77.77 MB/s
|
||||
Search_Digits_PCRE 500000 7534 ns/op
|
||||
Search_Digits_RE2 50000 44162 ns/op
|
||||
Parse_Digits_PCRE 200000 7664 ns/op
|
||||
Parse_Digits_RE2 100000 22595 ns/op
|
||||
Parse_CachedDigits_PCRE 5000000 721 ns/op
|
||||
Parse_CachedDigits_RE2 5000000 413 ns/op
|
||||
Parse_DigitDs_PCRE 500000 7095 ns/op
|
||||
Parse_DigitDs_RE2 100000 22259 ns/op
|
||||
Parse_CachedDigitDs_PCRE 5000000 704 ns/op
|
||||
Parse_CachedDigitDs_RE2 5000000 415 ns/op
|
||||
Parse_Split_PCRE 500000 5540 ns/op
|
||||
Parse_Split_RE2 100000 23817 ns/op
|
||||
Parse_CachedSplit_PCRE 5000000 490 ns/op
|
||||
Parse_CachedSplit_RE2 10000000 251 ns/op
|
||||
Parse_SplitHard_PCRE 500000 5410 ns/op
|
||||
Parse_SplitHard_RE2 100000 28518 ns/op
|
||||
Parse_CachedSplitHard_PCRE 5000000 488 ns/op
|
||||
Parse_CachedSplitHard_RE2 1000000 2489 ns/op
|
||||
Parse_CachedSplitBig1_PCRE 500 7171752 ns/op
|
||||
Parse_CachedSplitBig1_RE2 2000 990722 ns/op
|
||||
Parse_CachedSplitBig2_PCRE 5000 658331 ns/op
|
||||
Parse_CachedSplitBig2_RE2 20 81205250 ns/op
|
||||
BM_PCRE_Compile 500000 6443 ns/op
|
||||
BM_RE2_Compile 100000 24103 ns/op
|
||||
SearchPhone_CachedPCRE/8 1000000 2010 ns/op 3.98 MB/s
|
||||
SearchPhone_CachedPCRE/16 500000 3286 ns/op 4.87 MB/s
|
||||
SearchPhone_CachedPCRE/32 500000 5953 ns/op 5.37 MB/s
|
||||
SearchPhone_CachedPCRE/64 200000 11181 ns/op 5.72 MB/s
|
||||
SearchPhone_CachedPCRE/128 100000 21634 ns/op 5.92 MB/s
|
||||
SearchPhone_CachedPCRE/256 50000 42315 ns/op 6.05 MB/s
|
||||
SearchPhone_CachedPCRE/512 20000 83969 ns/op 6.10 MB/s
|
||||
SearchPhone_CachedPCRE/1K 10000 166005 ns/op 6.17 MB/s
|
||||
SearchPhone_CachedPCRE/2K 5000 327433 ns/op 6.25 MB/s
|
||||
SearchPhone_CachedPCRE/4K 5000 654794 ns/op 6.26 MB/s
|
||||
SearchPhone_CachedPCRE/8K 2000 1302747 ns/op 6.29 MB/s
|
||||
SearchPhone_CachedPCRE/16K 1000 2601137 ns/op 6.30 MB/s
|
||||
SearchPhone_CachedPCRE/32K 500 5170166 ns/op 6.34 MB/s
|
||||
SearchPhone_CachedPCRE/64K 100 10378910 ns/op 6.31 MB/s
|
||||
SearchPhone_CachedPCRE/128K 100 20783360 ns/op 6.31 MB/s
|
||||
SearchPhone_CachedPCRE/256K 50 41632940 ns/op 6.30 MB/s
|
||||
SearchPhone_CachedPCRE/512K 20 83663300 ns/op 6.27 MB/s
|
||||
SearchPhone_CachedPCRE/1M 10 167093400 ns/op 6.28 MB/s
|
||||
SearchPhone_CachedPCRE/2M 5 335078800 ns/op 6.26 MB/s
|
||||
SearchPhone_CachedPCRE/4M 5 673405400 ns/op 6.23 MB/s
|
||||
SearchPhone_CachedPCRE/8M 1 1335761000 ns/op 6.28 MB/s
|
||||
SearchPhone_CachedPCRE/16M 1 2682908000 ns/op 6.25 MB/s
|
||||
SearchPhone_CachedRE2/8 1000000 1470 ns/op 5.44 MB/s
|
||||
SearchPhone_CachedRE2/16 1000000 1496 ns/op 10.69 MB/s
|
||||
SearchPhone_CachedRE2/32 1000000 1570 ns/op 20.38 MB/s
|
||||
SearchPhone_CachedRE2/64 1000000 1770 ns/op 36.15 MB/s
|
||||
SearchPhone_CachedRE2/128 1000000 2082 ns/op 61.46 MB/s
|
||||
SearchPhone_CachedRE2/256 1000000 2701 ns/op 94.78 MB/s
|
||||
SearchPhone_CachedRE2/512 500000 3963 ns/op 129.19 MB/s
|
||||
SearchPhone_CachedRE2/1K 500000 6487 ns/op 157.85 MB/s
|
||||
SearchPhone_CachedRE2/2K 200000 11527 ns/op 177.67 MB/s
|
||||
SearchPhone_CachedRE2/4K 100000 21579 ns/op 189.81 MB/s
|
||||
SearchPhone_CachedRE2/8K 50000 41804 ns/op 195.96 MB/s
|
||||
SearchPhone_CachedRE2/16K 20000 82228 ns/op 199.25 MB/s
|
||||
SearchPhone_CachedRE2/32K 10000 163444 ns/op 200.48 MB/s
|
||||
SearchPhone_CachedRE2/64K 5000 325307 ns/op 201.46 MB/s
|
||||
SearchPhone_CachedRE2/128K 5000 648559 ns/op 202.10 MB/s
|
||||
SearchPhone_CachedRE2/256K 2000 1295574 ns/op 202.34 MB/s
|
||||
SearchPhone_CachedRE2/512K 1000 2591267 ns/op 202.33 MB/s
|
||||
SearchPhone_CachedRE2/1M 500 5178738 ns/op 202.48 MB/s
|
||||
SearchPhone_CachedRE2/2M 100 10389680 ns/op 201.85 MB/s
|
||||
SearchPhone_CachedRE2/4M 100 20851510 ns/op 201.15 MB/s
|
||||
SearchPhone_CachedRE2/8M 50 41763800 ns/op 200.86 MB/s
|
||||
SearchPhone_CachedRE2/16M 20 83492800 ns/op 200.94 MB/s
|
||||
EmptyPartialMatchPCRE 10000000 195 ns/op
|
||||
EmptyPartialMatchRE2 5000000 497 ns/op
|
||||
SimplePartialMatchPCRE 10000000 276 ns/op
|
||||
SimplePartialMatchRE2 5000000 548 ns/op
|
||||
HTTPPartialMatchPCRE 2000000 826 ns/op
|
||||
HTTPPartialMatchRE2 2000000 894 ns/op
|
||||
SmallHTTPPartialMatchPCRE 2000000 825 ns/op
|
||||
SmallHTTPPartialMatchRE2 2000000 895 ns/op
|
||||
DotMatchPCRE 2000000 810 ns/op
|
||||
DotMatchRE2 2000000 976 ns/op
|
||||
ASCIIMatchPCRE 5000000 604 ns/op
|
||||
ASCIIMatchRE2 2000000 976 ns/op
|
1475
third_party/re2/benchlog/benchlog.r70
vendored
1475
third_party/re2/benchlog/benchlog.r70
vendored
File diff suppressed because it is too large
Load Diff
1058
third_party/re2/benchlog/benchlog.wreck
vendored
1058
third_party/re2/benchlog/benchlog.wreck
vendored
File diff suppressed because it is too large
Load Diff
98
third_party/re2/benchlog/benchplot.py
vendored
98
third_party/re2/benchlog/benchplot.py
vendored
@ -1,98 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import argparse # for ArgumentParser
|
||||
import subprocess # for Popen
|
||||
import tempfile # for NamedTemporaryFile
|
||||
import os # for remove
|
||||
|
||||
class gnuplot(object):
    """Plot the MB/s columns of an re2 benchlog by piping a script to gnuplot.

    Intended to be used as a context manager: the temporary CSV files written
    by gen_csv() are deleted when the ``with`` block exits.
    """

    # Base name of the image file. The script below appends ".png" to it, so
    # the default produces "result.png.png".
    output = "result.png"

    # gnuplot preamble. The first "{}" is filled with `output` in run(); the
    # text ends with the "plot " keyword so per-benchmark clauses can be
    # appended directly after it.
    script = """
set terminal png size 1024, 768
set output "{}.png"
set title "re2 benchlog"
set datafile separator ";"
set grid x y
set ylabel "MB/s"
set autoscale
plot """

    # One plot clause per benchmark: x = CSV column 1 (row index), y = column 5
    # (the MB/s field), tick labels = column 2 (the size suffix). Each clause
    # ends with ",\<newline>" (3 characters) so the next clause continues the
    # same plot command.
    template = """'{}' using 1:5:xticlabels(2) with linespoints linewidth 3 title "{}",\\\n"""

    # Class-level defaults kept for backward compatibility; __init__ shadows
    # them with per-instance containers so instances never share parsed data.
    benchdata = dict()
    tempfiles = []

    def __init__(self):
        # Benchmark name -> list of [size, iterations, ns/op, MB/s] rows.
        self.benchdata = dict()
        # Paths of the CSV files created by gen_csv().
        self.tempfiles = []

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        """
        remove all temporary files
        """
        for filename in self.tempfiles:
            # A file that is already gone must not raise here: that would
            # replace whatever exception is currently propagating.
            if os.path.exists(filename):
                os.remove(filename)
        del self.tempfiles[:]

    def parse_re2_benchlog(self, filename):
        """
        parse the input benchlog and return a dictionary containing bench data

        Only lines with exactly four tab-separated fields are used (the ones
        that carry a MB/s column). The first field is split on "/" into the
        benchmark name and its size suffix; rows are grouped by name.
        """
        benchdata = self.benchdata

        with open(filename) as f:
            for raw in f:
                data = raw.split('\t')
                if len(data) != 4:
                    continue

                fields = [field.strip() for field in data[0].split('/') + data[1:]]
                benchdata.setdefault(fields[0], []).append(fields[1:])

        return benchdata

    def gen_csv(self):
        """
        generate temporary csv files

        Writes one ";"-separated file per benchmark name ("index;size;...")
        and appends the matching plot clause to self.script.
        """
        for name, data in self.benchdata.items():
            with tempfile.NamedTemporaryFile(delete=False) as f:
                for index, line in enumerate(data):
                    f.write('{};{}\n'.format(index, ';'.join(line)).encode())

                self.tempfiles.append(f.name)
                self.script = self.script + self.template.format(f.name, name)

    def run(self):
        """
        write the csv files and feed the finished script to gnuplot
        """
        if not self.benchdata:
            # Without any clause the [:-3] below would cut into the "plot "
            # keyword itself and hand gnuplot a broken script.
            print('no benchmark rows with a MB/s column were parsed; nothing to plot')
            return

        self.gen_csv()
        # Drop the trailing ",\<newline>" left by the last clause.
        script = self.script[:-3].format(self.output)
        command = subprocess.Popen(['gnuplot'], stdin=subprocess.PIPE)
        command.communicate(script.encode())
|
||||
|
||||
|
||||
if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='generate plots for benchlog')
    parser.add_argument('benchlog', type=str, help='benchlog generated by re2')
    args = parser.parse_args()

    # Check that gnuplot can be started before doing any work.
    try:
        probe = subprocess.Popen(['gnuplot'], stdin=subprocess.PIPE)
    except FileNotFoundError:
        print('you can install "gnuplot" to generate plots automatically')
        raise SystemExit(1)
    # communicate() closes the probe's stdin and waits for it to exit, so the
    # probe process is not left running next to the real plotting run below.
    probe.communicate()

    with gnuplot() as plot:
        # The image is named after the input file; the gnuplot script adds ".png".
        plot.output = args.benchlog
        plot.parse_re2_benchlog(args.benchlog)
        plot.run()
|
155
third_party/re2/benchlog/mktable
vendored
155
third_party/re2/benchlog/mktable
vendored
@ -1,155 +0,0 @@
|
||||
#!/usr/bin/perl
|
||||
# XXX
|
||||
|
||||
# Print an HTML table with one row per system in @sys, giving the PCRE and RE2
# timings (ns/op converted to microseconds) of one benchmark.
# $pattern is a sprintf format; it is expanded once with "PCRE" and once with
# "RE2" to form the two benchmark names looked up in %data.
sub table() {
    my ($pattern) = @_;

    print "<table border=0>\n"
        . "<tr><th>System</th><th>PCRE</th><th>RE2</th></tr>\n";

    for my $host (@sys) {
        my @micros = map {
            $data{$host}->{sprintf($pattern, $_)}->{'ns/op'} / 1000.
        } ("PCRE", "RE2");
        printf "<tr><td>%s</td><td>%.1f µs</td><td>%.1f µs</td></tr>\n",
            $sysname{$host}, $micros[0], $micros[1];
    }

    print "<tr height=5><td colspan=3></td></tr>\n"
        . "</table>\n";
}
|
||||
|
||||
# Size suffixes, in plotting order; graph() appends each one after "/" to form
# a benchmark name and uses the array index as the x coordinate.
@sizes = qw(
    8 16 32 64 128 256 512
    1K 2K 4K 8K 16K 32K 64K 128K 256K 512K
    1M 2M 4M 8M 16M
);

# Value written after "color" for each engine's curve in the jgraph input
# (presumably an RGB triple -- confirm against the jgraph manual).
%color = (
    PCRE => "0.7 0 0",
    RE2  => "0 0 1",
);

# Number of graphs produced so far; graph() increments it to name its files.
$ngraph = 0;
|
||||
|
||||
# Draw a PCRE-vs-RE2 throughput graph for one benchmark family and print an
# <img> tag that refers to it.
#
# $name is the benchmark name prefix; the keys looked up are
# "<name><PCRE|RE2>/<size>" for every entry of @sizes, always in the "wreck"
# data set. The graph is written as regexp3g<N>.jgr, converted to .eps with
# jgraph and to .png with gs, where N is the running counter $ngraph.
#
# The sub is only invoked through the code references in %func, which is why
# the empty prototype does not get in the way of passing $name.
sub graph() {
    my ($name) = @_;

    my $sys = "wreck";
    my $base = sprintf("regexp3g%d", ++$ngraph);

    open(my $jgr, ">", "$base.jgr") || die "open >$base.jgr: $!";
    print $jgr "bbox -20 -12 392 95\n";
    print $jgr "newgraph clip x_translate 0.25 y_translate 0.25\n";

    # One curve per engine; remember the largest value (for the y axis) and
    # each curve's last point (for its label).
    my $ymax = 0;
    my (%lastx, %lasty);
    foreach my $who ("PCRE", "RE2") {
        print $jgr "newcurve pts\n";
        for (my $i = 0; $i < @sizes; $i++) {
            my $key = sprintf("%s%s/%s", $name, $who, $sizes[$i]);
            my $val = $data{$sys}->{$key}->{'MB/s'};
            next if !defined($val);
            $ymax = $val if $val > $ymax;
            $lastx{$who} = $i;
            $lasty{$who} = $val;
            printf $jgr "%d %f (* %s *)\n", $i, $val, $key;
        }
        print $jgr "marktype none color $color{$who} linethickness 2 linetype solid label : $who\n";
    }

    # x axis: one position per size, labelled at every third size.
    my $n = @sizes;
    print $jgr "xaxis min -1 max $n size 5 label : text size (bytes)\n";
    print $jgr " no_auto_hash_marks hash_labels fontsize 9\n";
    for (my $i = 0; $i < @sizes; $i += 3) {
        print $jgr " hash_at $i hash_label at $i : $sizes[$i]\n";
    }

    # y axis maximum: the smallest k * 10^m (k in 2..10) that exceeds $ymax.
    my $y = 1;
    while (10 * $y <= $ymax) {
        $y = 10 * $y;
    }
    for (my $i = 2; $i <= 10; $i++) {
        if ($i * $y > $ymax) {
            $y = $i * $y;
            last;
        }
    }

    # Label each curve just to the right of its last point; labels that would
    # sit below 5% of the axis height are raised to that height.
    foreach my $who ("PCRE", "RE2") {
        my $x1 = $lastx{$who} * 1.01;
        my $y1 = $lasty{$who};
        my $v = "vjc";
        if ($y1 < 0.05 * $y) {
            $v = "vjb";
            $y1 = 0.05 * $y;
        }
        print $jgr "newstring x $x1 y $y1 hjl $v : $who\n";
    }
    print $jgr "yaxis min 0 max $y size 1 label : speed (MB/s)\n";
    print $jgr " hash_labels fontsize 9\n";
    # print $jgr "legend defaults font Times-Roman fontsize 10 x 0 y $y hjl vjt\n";

    # The file must be complete on disk before jgraph reads it.
    close($jgr) || die "close $base.jgr: $!";

    # Conversion stays best-effort (the page is still emitted), but a failing
    # tool is now reported instead of passing silently.
    system("jgraph $base.jgr >$base.eps") == 0
        or warn "jgraph $base.jgr failed (status $?)\n";
    system("gs -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -dEPSCrop -sDEVICE=png16m -r100 -sOutputFile=$base.png -dBATCH -dQUIT -dQUIET -dNOPAUSE $base.eps") == 0
        or warn "gs $base.eps failed (status $?)\n";

    printf "<img src=$base.png>\n"
}
|
||||
|
||||
# Consume input lines up to and including the next line that starts with
# "<!-- -->". The skipped lines are dropped; the terminating marker line is
# echoed to the output.
sub skip() {
    while (<>) {
        next unless /^<!-- -->/;
        print;
        last;
    }
}
|
||||
|
||||
# Systems whose logs are loaded; each one is read from "benchlog.<key>".
@sys = ("r70", "c2", "wreck", "mini");

# Text shown in the "System" column of table() for each key of @sys.
%sysname = (
    "r70" => "AMD Opteron 8214 HE, 2.2 GHz",
    "c2" => "Intel Core2 Duo E7200, 2.53 GHz",
    "wreck" => "Intel Xeon 5150, 2.66 GHz (Mac Pro)",
    "mini" => "Intel Core2 T5600, 1.83 GHz (Mac Mini)",
);

# Directive name (as written in "<!-- benchlog NAME ... -->") => generator.
%func = (
    "table" => \&table,
    "graph" => \&graph,
);

# Load every log into %data: $data{system}{benchmark name} is a hash with the
# keys "name", "iter", "ns/op" and, when the line has one, "MB/s".
foreach my $sys (@sys) {
    open(my $log, "<", "benchlog.$sys") || die "open benchlog.$sys: $!";
    my %sysdat;
    while (<$log>) {
        if (/^([A-Za-z0-9_\/]+)\s+(\d+)\s+(\d+) ns\/op/) {
            # Copy the captures before the next match overwrites $1.
            my %row = ("name" => $1, "iter" => $2, "ns/op" => $3);
            if (/([\d.]+) MB\/s/) {
                $row{"MB/s"} = $1;
            }
            $sysdat{$row{"name"}} = \%row;
        }
    }
    close($log);
    $data{$sys} = \%sysdat;
}

# Filter the input document to stdout. Every line is echoed; after a
# "<!-- benchlog NAME [ARG] -->" line the named generator prints fresh content
# and the old content up to the closing "<!-- -->" marker is discarded.
while (<>) {
    print;

    my ($directive, @args);
    if (/^<!-- benchlog (\w+) -->/) {
        $directive = $1;
    } elsif (/^<!-- benchlog (\w+) ([%\w]+) -->/) {
        ($directive, @args) = ($1, $2);
    } else {
        next;
    }

    # An unknown name used to fail on an undefined code reference; say which
    # directive is the problem instead.
    my $handler = $func{$directive}
        or die "unknown benchlog directive '$directive'\n";
    $handler->(@args);
    skip();
}
|
||||
|
1
third_party/re2/doc/README.xkcd
vendored
1
third_party/re2/doc/README.xkcd
vendored
@ -1 +0,0 @@
|
||||
xkcd.png is a cropped version of http://xkcd.com/208/
|
41
third_party/re2/doc/mksyntaxgo
vendored
41
third_party/re2/doc/mksyntaxgo
vendored
@ -1,41 +0,0 @@
|
||||
#!/bin/sh
#
# Copy syntax.txt to $GOROOT/src/regexp/syntax/doc.go and rewrite the copy in
# place with the sam editing script below.

set -e

# Without GOROOT the target would silently become /src/regexp/syntax/doc.go;
# stop with a clear message instead.
: "${GOROOT:?GOROOT must be set to the Go source tree}"

out="$GOROOT/src/regexp/syntax/doc.go"
cp syntax.txt "$out"
sam -d "$out" <<'!'
,x g/NOT SUPPORTED/d
/^Unicode character class/,$d
,s/[«»]//g
,x g/^Possessive repetitions:/d
,x g/\\C/d
,x g/Flag syntax/d
,s/.=(true|false)/flag &/g
,s/^Flags:/ Flag syntax is xyz (set) or -xyz (clear) or xy-z (set xy, clear z). The flags are:\n/
,s/\n\n\n+/\n\n/g
,x/(^.* .*\n)+/ | awk -F' ' '{printf(" %-14s %s\n", $1, $2)}'
1,2c
// Copyright 2012 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// DO NOT EDIT. This file is generated by mksyntaxgo from the RE2 distribution.

/*
Package syntax parses regular expressions into parse trees and compiles
parse trees into programs. Most clients of regular expressions will use the
facilities of package regexp (such as Compile and Match) instead of this package.

Syntax

The regular expression syntax understood by this package when parsing with the Perl flag is as follows.
Parts of the syntax can be disabled by passing alternate flags to Parse.

.
$a
*/
package syntax
.
w
q
!
|
42
third_party/re2/doc/mksyntaxhtml
vendored
42
third_party/re2/doc/mksyntaxhtml
vendored
@ -1,42 +0,0 @@
|
||||
#!/bin/sh
#
# Copy syntax.txt to syntax.html and rewrite the copy in place with the sam
# editing script below.

# Stop if the copy fails, so sam never edits a stale or missing syntax.html.
set -e

cp syntax.txt syntax.html
sam -d syntax.html <<'!'
,s/\&/\&amp;/g
,s/</\&lt;/g
,s/>/\&gt;/g
,s!== (([^()]|\([^()]*\))*)!≡ <code>\1</code>!g
,s!«!<code>!g
,s!»!</code>!g
,s! vim$! <font size=-2>VIM</font>!g
,s! pcre$! <font size=-2>PCRE</font>!g
,s! perl$! <font size=-2>PERL</font>!g
,x g/NOT SUPPORTED/ s!^[^ ]+!<font color=#808080>&</font>!
,s!NOT SUPPORTED!!g
,s!(^[^ ]+) (.*)\n!<tr><td><code>\1</code></td><td>\2</td></tr>\n!g
,s!.*:$!<b>&</b>!g
,s!^$!<tr><td></td></tr>!g
,x v/<tr>/ s!.*!<tr><td colspan=2>&</td></tr>!
1,2c
<html>
<!-- AUTOMATICALLY GENERATED by mksyntaxhtml -->
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<title>RE2 regular expression syntax reference</title>
</head>
<body>
<h1>RE2 regular expression syntax reference</h1>

<table border=0 cellpadding=2 cellspacing=2>
<tr><td colspan=2>This page lists the regular expression syntax accepted by RE2.</td></tr>
<tr><td colspan=2>It also lists syntax accepted by PCRE, PERL, and VIM.</td></tr>
<tr><td colspan=2>Grayed out expressions are not supported by RE2.</td></tr>
.
$a
</table>
</body>
</html>
.
w
q
!
|
36
third_party/re2/doc/mksyntaxwiki
vendored
36
third_party/re2/doc/mksyntaxwiki
vendored
@ -1,36 +0,0 @@
|
||||
#!/bin/sh
#
# Copy syntax.txt to syntax.wiki and rewrite the copy in place with the sam
# editing script below.

# Stop if the copy fails, so sam never edits a stale or missing syntax.wiki.
set -e

cp syntax.txt syntax.wiki
sam -d syntax.wiki <<'!'
,s!`!`````!g
,s!== (([^()]|\([^()]*\))*)!≡ `\1`!g
,s!«!`!g
,s!»!`!g
,s! vim$! <font size="1">VIM</font>!g
,s! pcre$! <font size="1">PCRE</font>!g
,s! perl$! <font size="1">PERL</font>!g
,s!(^[^ ]+) (.*)\n!`\1` \2\n!g
,x g/NOT SUPPORTED/ s!^[^ ]+!<font color="#808080">&</font>!
,s!NOT SUPPORTED!<font size="1">(&)</font>!g
,s!(^[^ ]+) (.*)\n!<tr><td>\1</td><td>\2</td></tr>\n!g
,s!.*:$!<b>&</b>!g
,s!^$!<tr><td></td></tr>!g
,x v/<tr>/ s!.*!<tr><td colspan="2">&</td></tr>!
1,2c
#summary I define UNIX as “30 definitions of regular expressions living under one roof.” —Don Knuth

<wiki:comment>
GENERATED BY mksyntaxwiki. DO NOT EDIT
</wiki:comment>

<table border="0" cellpadding="2" cellspacing="2">
<tr><td colspan="2">This page lists the regular expression syntax accepted by RE2.</td></tr>
<tr><td colspan="2">It also lists syntax accepted by PCRE, PERL, and VIM.</td></tr>
<tr><td colspan="2">Grayed out expressions are not supported by RE2.</td></tr>
.
$a
</table>
.
w
q
!
|
409
third_party/re2/doc/syntax.html
vendored
409
third_party/re2/doc/syntax.html
vendored
@ -1,409 +0,0 @@
|
||||
<html>
|
||||
<!-- AUTOMATICALLY GENERATED by mksyntaxhtml -->
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
|
||||
<title>RE2 regular expression syntax reference</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>RE2 regular expression syntax reference</h1>
|
||||
|
||||
<table border=0 cellpadding=2 cellspacing=2>
|
||||
<tr><td colspan=2>This page lists the regular expression syntax accepted by RE2.</td></tr>
|
||||
<tr><td colspan=2>It also lists syntax accepted by PCRE, PERL, and VIM.</td></tr>
|
||||
<tr><td colspan=2>Grayed out expressions are not supported by RE2.</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Single characters:</b></td></tr>
|
||||
<tr><td><code>.</code></td><td>any character, possibly including newline (s=true)</td></tr>
|
||||
<tr><td><code>[xyz]</code></td><td>character class</td></tr>
|
||||
<tr><td><code>[^xyz]</code></td><td>negated character class</td></tr>
|
||||
<tr><td><code>\d</code></td><td>Perl character class</td></tr>
|
||||
<tr><td><code>\D</code></td><td>negated Perl character class</td></tr>
|
||||
<tr><td><code>[[:alpha:]]</code></td><td>ASCII character class</td></tr>
|
||||
<tr><td><code>[[:^alpha:]]</code></td><td>negated ASCII character class</td></tr>
|
||||
<tr><td><code>\pN</code></td><td>Unicode character class (one-letter name)</td></tr>
|
||||
<tr><td><code>\p{Greek}</code></td><td>Unicode character class</td></tr>
|
||||
<tr><td><code>\PN</code></td><td>negated Unicode character class (one-letter name)</td></tr>
|
||||
<tr><td><code>\P{Greek}</code></td><td>negated Unicode character class</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Composites:</b></td></tr>
|
||||
<tr><td><code>xy</code></td><td><code>x</code> followed by <code>y</code></td></tr>
|
||||
<tr><td><code>x|y</code></td><td><code>x</code> or <code>y</code> (prefer <code>x</code>)</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Repetitions:</b></td></tr>
|
||||
<tr><td><code>x*</code></td><td>zero or more <code>x</code>, prefer more</td></tr>
|
||||
<tr><td><code>x+</code></td><td>one or more <code>x</code>, prefer more</td></tr>
|
||||
<tr><td><code>x?</code></td><td>zero or one <code>x</code>, prefer one</td></tr>
|
||||
<tr><td><code>x{n,m}</code></td><td><code>n</code> or <code>n</code>+1 or ... or <code>m</code> <code>x</code>, prefer more</td></tr>
|
||||
<tr><td><code>x{n,}</code></td><td><code>n</code> or more <code>x</code>, prefer more</td></tr>
|
||||
<tr><td><code>x{n}</code></td><td>exactly <code>n</code> <code>x</code></td></tr>
|
||||
<tr><td><code>x*?</code></td><td>zero or more <code>x</code>, prefer fewer</td></tr>
|
||||
<tr><td><code>x+?</code></td><td>one or more <code>x</code>, prefer fewer</td></tr>
|
||||
<tr><td><code>x??</code></td><td>zero or one <code>x</code>, prefer zero</td></tr>
|
||||
<tr><td><code>x{n,m}?</code></td><td><code>n</code> or <code>n</code>+1 or ... or <code>m</code> <code>x</code>, prefer fewer</td></tr>
|
||||
<tr><td><code>x{n,}?</code></td><td><code>n</code> or more <code>x</code>, prefer fewer</td></tr>
|
||||
<tr><td><code>x{n}?</code></td><td>exactly <code>n</code> <code>x</code></td></tr>
|
||||
<tr><td><code><font color=#808080>x{}</font></code></td><td>(≡ <code>x*</code>) <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>x{-}</font></code></td><td>(≡ <code>x*?</code>) <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>x{-n}</font></code></td><td>(≡ <code>x{n}?</code>) <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>x=</font></code></td><td>(≡ <code>x?</code>) <font size=-2>VIM</font></td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Possessive repetitions:</b></td></tr>
|
||||
<tr><td><code><font color=#808080>x*+</font></code></td><td>zero or more <code>x</code>, possessive </td></tr>
|
||||
<tr><td><code><font color=#808080>x++</font></code></td><td>one or more <code>x</code>, possessive </td></tr>
|
||||
<tr><td><code><font color=#808080>x?+</font></code></td><td>zero or one <code>x</code>, possessive </td></tr>
|
||||
<tr><td><code><font color=#808080>x{n,m}+</font></code></td><td><code>n</code> or ... or <code>m</code> <code>x</code>, possessive </td></tr>
|
||||
<tr><td><code><font color=#808080>x{n,}+</font></code></td><td><code>n</code> or more <code>x</code>, possessive </td></tr>
|
||||
<tr><td><code><font color=#808080>x{n}+</font></code></td><td>exactly <code>n</code> <code>x</code>, possessive </td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Grouping:</b></td></tr>
|
||||
<tr><td><code>(re)</code></td><td>numbered capturing group</td></tr>
|
||||
<tr><td><code>(?P&lt;name&gt;re)</code></td><td>named &amp; numbered capturing group</td></tr>
|
||||
<tr><td><code><font color=#808080>(?&lt;name&gt;re)</font></code></td><td>named &amp; numbered capturing group </td></tr>
|
||||
<tr><td><code><font color=#808080>(?'name're)</font></code></td><td>named &amp; numbered capturing group </td></tr>
|
||||
<tr><td><code>(?:re)</code></td><td>non-capturing group</td></tr>
|
||||
<tr><td><code>(?flags)</code></td><td>set flags within current group; non-capturing</td></tr>
|
||||
<tr><td><code>(?flags:re)</code></td><td>set flags during re; non-capturing</td></tr>
|
||||
<tr><td><code><font color=#808080>(?#text)</font></code></td><td>comment </td></tr>
|
||||
<tr><td><code><font color=#808080>(?|x|y|z)</font></code></td><td>branch numbering reset </td></tr>
|
||||
<tr><td><code><font color=#808080>(?>re)</font></code></td><td>possessive match of <code>re</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>re@></font></code></td><td>possessive match of <code>re</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>%(re)</font></code></td><td>non-capturing group <font size=-2>VIM</font></td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Flags:</b></td></tr>
|
||||
<tr><td><code>i</code></td><td>case-insensitive (default false)</td></tr>
|
||||
<tr><td><code>m</code></td><td>multi-line mode: <code>^</code> and <code>$</code> match begin/end line in addition to begin/end text (default false)</td></tr>
|
||||
<tr><td><code>s</code></td><td>let <code>.</code> match <code>\n</code> (default false)</td></tr>
|
||||
<tr><td><code>U</code></td><td>ungreedy: swap meaning of <code>x*</code> and <code>x*?</code>, <code>x+</code> and <code>x+?</code>, etc (default false)</td></tr>
|
||||
<tr><td colspan=2>Flag syntax is <code>xyz</code> (set) or <code>-xyz</code> (clear) or <code>xy-z</code> (set <code>xy</code>, clear <code>z</code>).</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Empty strings:</b></td></tr>
|
||||
<tr><td><code>^</code></td><td>at beginning of text or line (<code>m</code>=true)</td></tr>
|
||||
<tr><td><code>$</code></td><td>at end of text (like <code>\z</code> not <code>\Z</code>) or line (<code>m</code>=true)</td></tr>
|
||||
<tr><td><code>\A</code></td><td>at beginning of text</td></tr>
|
||||
<tr><td><code>\b</code></td><td>at word boundary (<code>\w</code> on one side and <code>\W</code>, <code>\A</code>, or <code>\z</code> on the other)</td></tr>
|
||||
<tr><td><code>\B</code></td><td>not a word boundary</td></tr>
|
||||
<tr><td><code><font color=#808080>\G</font></code></td><td>at beginning of subtext being searched <font size=-2>PCRE</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\G</font></code></td><td>at end of last match <font size=-2>PERL</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\Z</font></code></td><td>at end of text, or before newline at end of text </td></tr>
|
||||
<tr><td><code>\z</code></td><td>at end of text</td></tr>
|
||||
<tr><td><code><font color=#808080>(?=re)</font></code></td><td>before text matching <code>re</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>(?!re)</font></code></td><td>before text not matching <code>re</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>(?&lt;=re)</font></code></td><td>after text matching <code>re</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>(?&lt;!re)</font></code></td><td>after text not matching <code>re</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>re&</font></code></td><td>before text matching <code>re</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>re@=</font></code></td><td>before text matching <code>re</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>re@!</font></code></td><td>before text not matching <code>re</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>re@&lt;=</font></code></td><td>after text matching <code>re</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>re@&lt;!</font></code></td><td>after text not matching <code>re</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\zs</font></code></td><td>sets start of match (= \K) <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\ze</font></code></td><td>sets end of match <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%^</font></code></td><td>beginning of file <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%$</font></code></td><td>end of file <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%V</font></code></td><td>on screen <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%#</font></code></td><td>cursor position <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%'m</font></code></td><td>mark <code>m</code> position <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%23l</font></code></td><td>in line 23 <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%23c</font></code></td><td>in column 23 <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%23v</font></code></td><td>in virtual column 23 <font size=-2>VIM</font></td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Escape sequences:</b></td></tr>
|
||||
<tr><td><code>\a</code></td><td>bell (≡ <code>\007</code>)</td></tr>
|
||||
<tr><td><code>\f</code></td><td>form feed (≡ <code>\014</code>)</td></tr>
|
||||
<tr><td><code>\t</code></td><td>horizontal tab (≡ <code>\011</code>)</td></tr>
|
||||
<tr><td><code>\n</code></td><td>newline (≡ <code>\012</code>)</td></tr>
|
||||
<tr><td><code>\r</code></td><td>carriage return (≡ <code>\015</code>)</td></tr>
|
||||
<tr><td><code>\v</code></td><td>vertical tab character (≡ <code>\013</code>)</td></tr>
|
||||
<tr><td><code>\*</code></td><td>literal <code>*</code>, for any punctuation character <code>*</code></td></tr>
|
||||
<tr><td><code>\123</code></td><td>octal character code (up to three digits)</td></tr>
|
||||
<tr><td><code>\x7F</code></td><td>hex character code (exactly two digits)</td></tr>
|
||||
<tr><td><code>\x{10FFFF}</code></td><td>hex character code</td></tr>
|
||||
<tr><td><code>\C</code></td><td>match a single byte even in UTF-8 mode</td></tr>
|
||||
<tr><td><code>\Q...\E</code></td><td>literal text <code>...</code> even if <code>...</code> has punctuation</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td><code><font color=#808080>\1</font></code></td><td>backreference </td></tr>
|
||||
<tr><td><code><font color=#808080>\b</font></code></td><td>backspace (use <code>\010</code>)</td></tr>
|
||||
<tr><td><code><font color=#808080>\cK</font></code></td><td>control char ^K (use <code>\001</code> etc)</td></tr>
|
||||
<tr><td><code><font color=#808080>\e</font></code></td><td>escape (use <code>\033</code>)</td></tr>
|
||||
<tr><td><code><font color=#808080>\g1</font></code></td><td>backreference </td></tr>
|
||||
<tr><td><code><font color=#808080>\g{1}</font></code></td><td>backreference </td></tr>
|
||||
<tr><td><code><font color=#808080>\g{+1}</font></code></td><td>backreference </td></tr>
|
||||
<tr><td><code><font color=#808080>\g{-1}</font></code></td><td>backreference </td></tr>
|
||||
<tr><td><code><font color=#808080>\g{name}</font></code></td><td>named backreference </td></tr>
|
||||
<tr><td><code><font color=#808080>\g&lt;name&gt;</font></code></td><td>subroutine call </td></tr>
|
||||
<tr><td><code><font color=#808080>\g'name'</font></code></td><td>subroutine call </td></tr>
|
||||
<tr><td><code><font color=#808080>\k&lt;name&gt;</font></code></td><td>named backreference </td></tr>
|
||||
<tr><td><code><font color=#808080>\k'name'</font></code></td><td>named backreference </td></tr>
|
||||
<tr><td><code><font color=#808080>\lX</font></code></td><td>lowercase <code>X</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>\ux</font></code></td><td>uppercase <code>x</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>\L...\E</font></code></td><td>lowercase text <code>...</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>\K</font></code></td><td>reset beginning of <code>$0</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>\N{name}</font></code></td><td>named Unicode character </td></tr>
|
||||
<tr><td><code><font color=#808080>\R</font></code></td><td>line break </td></tr>
|
||||
<tr><td><code><font color=#808080>\U...\E</font></code></td><td>upper case text <code>...</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>\X</font></code></td><td>extended Unicode sequence </td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td><code><font color=#808080>\%d123</font></code></td><td>decimal character 123 <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%xFF</font></code></td><td>hex character FF <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%o123</font></code></td><td>octal character 123 <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%u1234</font></code></td><td>Unicode character 0x1234 <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\%U12345678</font></code></td><td>Unicode character 0x12345678 <font size=-2>VIM</font></td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Character class elements:</b></td></tr>
|
||||
<tr><td><code>x</code></td><td>single character</td></tr>
|
||||
<tr><td><code>A-Z</code></td><td>character range (inclusive)</td></tr>
|
||||
<tr><td><code>\d</code></td><td>Perl character class</td></tr>
|
||||
<tr><td><code>[:foo:]</code></td><td>ASCII character class <code>foo</code></td></tr>
|
||||
<tr><td><code>\p{Foo}</code></td><td>Unicode character class <code>Foo</code></td></tr>
|
||||
<tr><td><code>\pF</code></td><td>Unicode character class <code>F</code> (one-letter name)</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Named character classes as character class elements:</b></td></tr>
|
||||
<tr><td><code>[\d]</code></td><td>digits (≡ <code>\d</code>)</td></tr>
|
||||
<tr><td><code>[^\d]</code></td><td>not digits (≡ <code>\D</code>)</td></tr>
|
||||
<tr><td><code>[\D]</code></td><td>not digits (≡ <code>\D</code>)</td></tr>
|
||||
<tr><td><code>[^\D]</code></td><td>not not digits (≡ <code>\d</code>)</td></tr>
|
||||
<tr><td><code>[[:name:]]</code></td><td>named ASCII class inside character class (≡ <code>[:name:]</code>)</td></tr>
|
||||
<tr><td><code>[^[:name:]]</code></td><td>named ASCII class inside negated character class (≡ <code>[:^name:]</code>)</td></tr>
|
||||
<tr><td><code>[\p{Name}]</code></td><td>named Unicode property inside character class (≡ <code>\p{Name}</code>)</td></tr>
|
||||
<tr><td><code>[^\p{Name}]</code></td><td>named Unicode property inside negated character class (≡ <code>\P{Name}</code>)</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Perl character classes:</b></td></tr>
|
||||
<tr><td><code>\d</code></td><td>digits (≡ <code>[0-9]</code>)</td></tr>
|
||||
<tr><td><code>\D</code></td><td>not digits (≡ <code>[^0-9]</code>)</td></tr>
|
||||
<tr><td><code>\s</code></td><td>whitespace (≡ <code>[\t\n\f\r ]</code>)</td></tr>
|
||||
<tr><td><code>\S</code></td><td>not whitespace (≡ <code>[^\t\n\f\r ]</code>)</td></tr>
|
||||
<tr><td><code>\w</code></td><td>word characters (≡ <code>[0-9A-Za-z_]</code>)</td></tr>
|
||||
<tr><td><code>\W</code></td><td>not word characters (≡ <code>[^0-9A-Za-z_]</code>)</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td><code><font color=#808080>\h</font></code></td><td>horizontal space </td></tr>
|
||||
<tr><td><code><font color=#808080>\H</font></code></td><td>not horizontal space </td></tr>
|
||||
<tr><td><code><font color=#808080>\v</font></code></td><td>vertical space </td></tr>
|
||||
<tr><td><code><font color=#808080>\V</font></code></td><td>not vertical space </td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>ASCII character classes:</b></td></tr>
|
||||
<tr><td><code>[[:alnum:]]</code></td><td>alphanumeric (≡ <code>[0-9A-Za-z]</code>)</td></tr>
|
||||
<tr><td><code>[[:alpha:]]</code></td><td>alphabetic (≡ <code>[A-Za-z]</code>)</td></tr>
|
||||
<tr><td><code>[[:ascii:]]</code></td><td>ASCII (≡ <code>[\x00-\x7F]</code>)</td></tr>
|
||||
<tr><td><code>[[:blank:]]</code></td><td>blank (≡ <code>[\t ]</code>)</td></tr>
|
||||
<tr><td><code>[[:cntrl:]]</code></td><td>control (≡ <code>[\x00-\x1F\x7F]</code>)</td></tr>
|
||||
<tr><td><code>[[:digit:]]</code></td><td>digits (≡ <code>[0-9]</code>)</td></tr>
|
||||
<tr><td><code>[[:graph:]]</code></td><td>graphical (≡ <code>[!-~] == [A-Za-z0-9!"#$%&amp;'()*+,\-./:;&lt;=&gt;?@[\\\]^_`{|}~]</code>)</td></tr>
|
||||
<tr><td><code>[[:lower:]]</code></td><td>lower case (≡ <code>[a-z]</code>)</td></tr>
|
||||
<tr><td><code>[[:print:]]</code></td><td>printable (≡ <code>[ -~] == [ [:graph:]]</code>)</td></tr>
|
||||
<tr><td><code>[[:punct:]]</code></td><td>punctuation (≡ <code>[!-/:-@[-`{-~]</code>)</td></tr>
|
||||
<tr><td><code>[[:space:]]</code></td><td>whitespace (≡ <code>[\t\n\v\f\r ]</code>)</td></tr>
|
||||
<tr><td><code>[[:upper:]]</code></td><td>upper case (≡ <code>[A-Z]</code>)</td></tr>
|
||||
<tr><td><code>[[:word:]]</code></td><td>word characters (≡ <code>[0-9A-Za-z_]</code>)</td></tr>
|
||||
<tr><td><code>[[:xdigit:]]</code></td><td>hex digit (≡ <code>[0-9A-Fa-f]</code>)</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Unicode character class names--general category:</b></td></tr>
|
||||
<tr><td><code>C</code></td><td>other</td></tr>
|
||||
<tr><td><code>Cc</code></td><td>control</td></tr>
|
||||
<tr><td><code>Cf</code></td><td>format</td></tr>
|
||||
<tr><td><code><font color=#808080>Cn</font></code></td><td>unassigned code points </td></tr>
|
||||
<tr><td><code>Co</code></td><td>private use</td></tr>
|
||||
<tr><td><code>Cs</code></td><td>surrogate</td></tr>
|
||||
<tr><td><code>L</code></td><td>letter</td></tr>
|
||||
<tr><td><code><font color=#808080>LC</font></code></td><td>cased letter </td></tr>
|
||||
<tr><td><code><font color=#808080>L&</font></code></td><td>cased letter </td></tr>
|
||||
<tr><td><code>Ll</code></td><td>lowercase letter</td></tr>
|
||||
<tr><td><code>Lm</code></td><td>modifier letter</td></tr>
|
||||
<tr><td><code>Lo</code></td><td>other letter</td></tr>
|
||||
<tr><td><code>Lt</code></td><td>titlecase letter</td></tr>
|
||||
<tr><td><code>Lu</code></td><td>uppercase letter</td></tr>
|
||||
<tr><td><code>M</code></td><td>mark</td></tr>
|
||||
<tr><td><code>Mc</code></td><td>spacing mark</td></tr>
|
||||
<tr><td><code>Me</code></td><td>enclosing mark</td></tr>
|
||||
<tr><td><code>Mn</code></td><td>non-spacing mark</td></tr>
|
||||
<tr><td><code>N</code></td><td>number</td></tr>
|
||||
<tr><td><code>Nd</code></td><td>decimal number</td></tr>
|
||||
<tr><td><code>Nl</code></td><td>letter number</td></tr>
|
||||
<tr><td><code>No</code></td><td>other number</td></tr>
|
||||
<tr><td><code>P</code></td><td>punctuation</td></tr>
|
||||
<tr><td><code>Pc</code></td><td>connector punctuation</td></tr>
|
||||
<tr><td><code>Pd</code></td><td>dash punctuation</td></tr>
|
||||
<tr><td><code>Pe</code></td><td>close punctuation</td></tr>
|
||||
<tr><td><code>Pf</code></td><td>final punctuation</td></tr>
|
||||
<tr><td><code>Pi</code></td><td>initial punctuation</td></tr>
|
||||
<tr><td><code>Po</code></td><td>other punctuation</td></tr>
|
||||
<tr><td><code>Ps</code></td><td>open punctuation</td></tr>
|
||||
<tr><td><code>S</code></td><td>symbol</td></tr>
|
||||
<tr><td><code>Sc</code></td><td>currency symbol</td></tr>
|
||||
<tr><td><code>Sk</code></td><td>modifier symbol</td></tr>
|
||||
<tr><td><code>Sm</code></td><td>math symbol</td></tr>
|
||||
<tr><td><code>So</code></td><td>other symbol</td></tr>
|
||||
<tr><td><code>Z</code></td><td>separator</td></tr>
|
||||
<tr><td><code>Zl</code></td><td>line separator</td></tr>
|
||||
<tr><td><code>Zp</code></td><td>paragraph separator</td></tr>
|
||||
<tr><td><code>Zs</code></td><td>space separator</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Unicode character class names--scripts:</b></td></tr>
|
||||
<tr><td><code>Arabic</code></td><td>Arabic</td></tr>
|
||||
<tr><td><code>Armenian</code></td><td>Armenian</td></tr>
|
||||
<tr><td><code>Balinese</code></td><td>Balinese</td></tr>
|
||||
<tr><td><code>Bamum</code></td><td>Bamum</td></tr>
|
||||
<tr><td><code>Batak</code></td><td>Batak</td></tr>
|
||||
<tr><td><code>Bengali</code></td><td>Bengali</td></tr>
|
||||
<tr><td><code>Bopomofo</code></td><td>Bopomofo</td></tr>
|
||||
<tr><td><code>Brahmi</code></td><td>Brahmi</td></tr>
|
||||
<tr><td><code>Braille</code></td><td>Braille</td></tr>
|
||||
<tr><td><code>Buginese</code></td><td>Buginese</td></tr>
|
||||
<tr><td><code>Buhid</code></td><td>Buhid</td></tr>
|
||||
<tr><td><code>Canadian_Aboriginal</code></td><td>Canadian Aboriginal</td></tr>
|
||||
<tr><td><code>Carian</code></td><td>Carian</td></tr>
|
||||
<tr><td><code>Chakma</code></td><td>Chakma</td></tr>
|
||||
<tr><td><code>Cham</code></td><td>Cham</td></tr>
|
||||
<tr><td><code>Cherokee</code></td><td>Cherokee</td></tr>
|
||||
<tr><td><code>Common</code></td><td>characters not specific to one script</td></tr>
|
||||
<tr><td><code>Coptic</code></td><td>Coptic</td></tr>
|
||||
<tr><td><code>Cuneiform</code></td><td>Cuneiform</td></tr>
|
||||
<tr><td><code>Cypriot</code></td><td>Cypriot</td></tr>
|
||||
<tr><td><code>Cyrillic</code></td><td>Cyrillic</td></tr>
|
||||
<tr><td><code>Deseret</code></td><td>Deseret</td></tr>
|
||||
<tr><td><code>Devanagari</code></td><td>Devanagari</td></tr>
|
||||
<tr><td><code>Egyptian_Hieroglyphs</code></td><td>Egyptian Hieroglyphs</td></tr>
|
||||
<tr><td><code>Ethiopic</code></td><td>Ethiopic</td></tr>
|
||||
<tr><td><code>Georgian</code></td><td>Georgian</td></tr>
|
||||
<tr><td><code>Glagolitic</code></td><td>Glagolitic</td></tr>
|
||||
<tr><td><code>Gothic</code></td><td>Gothic</td></tr>
|
||||
<tr><td><code>Greek</code></td><td>Greek</td></tr>
|
||||
<tr><td><code>Gujarati</code></td><td>Gujarati</td></tr>
|
||||
<tr><td><code>Gurmukhi</code></td><td>Gurmukhi</td></tr>
|
||||
<tr><td><code>Han</code></td><td>Han</td></tr>
|
||||
<tr><td><code>Hangul</code></td><td>Hangul</td></tr>
|
||||
<tr><td><code>Hanunoo</code></td><td>Hanunoo</td></tr>
|
||||
<tr><td><code>Hebrew</code></td><td>Hebrew</td></tr>
|
||||
<tr><td><code>Hiragana</code></td><td>Hiragana</td></tr>
|
||||
<tr><td><code>Imperial_Aramaic</code></td><td>Imperial Aramaic</td></tr>
|
||||
<tr><td><code>Inherited</code></td><td>inherit script from previous character</td></tr>
|
||||
<tr><td><code>Inscriptional_Pahlavi</code></td><td>Inscriptional Pahlavi</td></tr>
|
||||
<tr><td><code>Inscriptional_Parthian</code></td><td>Inscriptional Parthian</td></tr>
|
||||
<tr><td><code>Javanese</code></td><td>Javanese</td></tr>
|
||||
<tr><td><code>Kaithi</code></td><td>Kaithi</td></tr>
|
||||
<tr><td><code>Kannada</code></td><td>Kannada</td></tr>
|
||||
<tr><td><code>Katakana</code></td><td>Katakana</td></tr>
|
||||
<tr><td><code>Kayah_Li</code></td><td>Kayah Li</td></tr>
|
||||
<tr><td><code>Kharoshthi</code></td><td>Kharoshthi</td></tr>
|
||||
<tr><td><code>Khmer</code></td><td>Khmer</td></tr>
|
||||
<tr><td><code>Lao</code></td><td>Lao</td></tr>
|
||||
<tr><td><code>Latin</code></td><td>Latin</td></tr>
|
||||
<tr><td><code>Lepcha</code></td><td>Lepcha</td></tr>
|
||||
<tr><td><code>Limbu</code></td><td>Limbu</td></tr>
|
||||
<tr><td><code>Linear_B</code></td><td>Linear B</td></tr>
|
||||
<tr><td><code>Lycian</code></td><td>Lycian</td></tr>
|
||||
<tr><td><code>Lydian</code></td><td>Lydian</td></tr>
|
||||
<tr><td><code>Malayalam</code></td><td>Malayalam</td></tr>
|
||||
<tr><td><code>Mandaic</code></td><td>Mandaic</td></tr>
|
||||
<tr><td><code>Meetei_Mayek</code></td><td>Meetei Mayek</td></tr>
|
||||
<tr><td><code>Meroitic_Cursive</code></td><td>Meroitic Cursive</td></tr>
|
||||
<tr><td><code>Meroitic_Hieroglyphs</code></td><td>Meroitic Hieroglyphs</td></tr>
|
||||
<tr><td><code>Miao</code></td><td>Miao</td></tr>
|
||||
<tr><td><code>Mongolian</code></td><td>Mongolian</td></tr>
|
||||
<tr><td><code>Myanmar</code></td><td>Myanmar</td></tr>
|
||||
<tr><td><code>New_Tai_Lue</code></td><td>New Tai Lue (aka Simplified Tai Lue)</td></tr>
|
||||
<tr><td><code>Nko</code></td><td>Nko</td></tr>
|
||||
<tr><td><code>Ogham</code></td><td>Ogham</td></tr>
|
||||
<tr><td><code>Ol_Chiki</code></td><td>Ol Chiki</td></tr>
|
||||
<tr><td><code>Old_Italic</code></td><td>Old Italic</td></tr>
|
||||
<tr><td><code>Old_Persian</code></td><td>Old Persian</td></tr>
|
||||
<tr><td><code>Old_South_Arabian</code></td><td>Old South Arabian</td></tr>
|
||||
<tr><td><code>Old_Turkic</code></td><td>Old Turkic</td></tr>
|
||||
<tr><td><code>Oriya</code></td><td>Oriya</td></tr>
|
||||
<tr><td><code>Osmanya</code></td><td>Osmanya</td></tr>
|
||||
<tr><td><code>Phags_Pa</code></td><td>'Phags Pa</td></tr>
|
||||
<tr><td><code>Phoenician</code></td><td>Phoenician</td></tr>
|
||||
<tr><td><code>Rejang</code></td><td>Rejang</td></tr>
|
||||
<tr><td><code>Runic</code></td><td>Runic</td></tr>
|
||||
<tr><td><code>Saurashtra</code></td><td>Saurashtra</td></tr>
|
||||
<tr><td><code>Sharada</code></td><td>Sharada</td></tr>
|
||||
<tr><td><code>Shavian</code></td><td>Shavian</td></tr>
|
||||
<tr><td><code>Sinhala</code></td><td>Sinhala</td></tr>
|
||||
<tr><td><code>Sora_Sompeng</code></td><td>Sora Sompeng</td></tr>
|
||||
<tr><td><code>Sundanese</code></td><td>Sundanese</td></tr>
|
||||
<tr><td><code>Syloti_Nagri</code></td><td>Syloti Nagri</td></tr>
|
||||
<tr><td><code>Syriac</code></td><td>Syriac</td></tr>
|
||||
<tr><td><code>Tagalog</code></td><td>Tagalog</td></tr>
|
||||
<tr><td><code>Tagbanwa</code></td><td>Tagbanwa</td></tr>
|
||||
<tr><td><code>Tai_Le</code></td><td>Tai Le</td></tr>
|
||||
<tr><td><code>Tai_Tham</code></td><td>Tai Tham</td></tr>
|
||||
<tr><td><code>Tai_Viet</code></td><td>Tai Viet</td></tr>
|
||||
<tr><td><code>Takri</code></td><td>Takri</td></tr>
|
||||
<tr><td><code>Tamil</code></td><td>Tamil</td></tr>
|
||||
<tr><td><code>Telugu</code></td><td>Telugu</td></tr>
|
||||
<tr><td><code>Thaana</code></td><td>Thaana</td></tr>
|
||||
<tr><td><code>Thai</code></td><td>Thai</td></tr>
|
||||
<tr><td><code>Tibetan</code></td><td>Tibetan</td></tr>
|
||||
<tr><td><code>Tifinagh</code></td><td>Tifinagh</td></tr>
|
||||
<tr><td><code>Ugaritic</code></td><td>Ugaritic</td></tr>
|
||||
<tr><td><code>Vai</code></td><td>Vai</td></tr>
|
||||
<tr><td><code>Yi</code></td><td>Yi</td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Vim character classes:</b></td></tr>
|
||||
<tr><td><code><font color=#808080>\i</font></code></td><td>identifier character <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\I</font></code></td><td><code>\i</code> except digits <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\k</font></code></td><td>keyword character <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\K</font></code></td><td><code>\k</code> except digits <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\f</font></code></td><td>file name character <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\F</font></code></td><td><code>\f</code> except digits <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\p</font></code></td><td>printable character <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\P</font></code></td><td><code>\p</code> except digits <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\s</font></code></td><td>whitespace character (≡ <code>[ \t]</code>) <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\S</font></code></td><td>non-white space character (≡ <code>[^ \t]</code>) <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code>\d</code></td><td>digits (≡ <code>[0-9]</code>) <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code>\D</code></td><td>not <code>\d</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\x</font></code></td><td>hex digits (≡ <code>[0-9A-Fa-f]</code>) <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\X</font></code></td><td>not <code>\x</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\o</font></code></td><td>octal digits (≡ <code>[0-7]</code>) <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\O</font></code></td><td>not <code>\o</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code>\w</code></td><td>word character <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code>\W</code></td><td>not <code>\w</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\h</font></code></td><td>head of word character <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\H</font></code></td><td>not <code>\h</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\a</font></code></td><td>alphabetic <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\A</font></code></td><td>not <code>\a</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\l</font></code></td><td>lowercase <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\L</font></code></td><td>not lowercase <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\u</font></code></td><td>uppercase <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\U</font></code></td><td>not uppercase <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\_x</font></code></td><td><code>\x</code> plus newline, for any <code>x</code> <font size=-2>VIM</font></td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Vim flags:</b></td></tr>
|
||||
<tr><td><code><font color=#808080>\c</font></code></td><td>ignore case <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\C</font></code></td><td>match case <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\m</font></code></td><td>magic <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\M</font></code></td><td>nomagic <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\v</font></code></td><td>verymagic <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\V</font></code></td><td>verynomagic <font size=-2>VIM</font></td></tr>
|
||||
<tr><td><code><font color=#808080>\Z</font></code></td><td>ignore differences in Unicode combining characters <font size=-2>VIM</font></td></tr>
|
||||
<tr><td></td></tr>
|
||||
<tr><td colspan=2><b>Magic:</b></td></tr>
|
||||
<tr><td><code><font color=#808080>(?{code})</font></code></td><td>arbitrary Perl code <font size=-2>PERL</font></td></tr>
|
||||
<tr><td><code><font color=#808080>(??{code})</font></code></td><td>postponed arbitrary Perl code <font size=-2>PERL</font></td></tr>
|
||||
<tr><td><code><font color=#808080>(?n)</font></code></td><td>recursive call to regexp capturing group <code>n</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>(?+n)</font></code></td><td>recursive call to relative group <code>+n</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>(?-n)</font></code></td><td>recursive call to relative group <code>-n</code> </td></tr>
|
||||
<tr><td><code><font color=#808080>(?C)</font></code></td><td>PCRE callout <font size=-2>PCRE</font></td></tr>
|
||||
<tr><td><code><font color=#808080>(?R)</font></code></td><td>recursive call to entire regexp (≡ <code>(?0)</code>) </td></tr>
|
||||
<tr><td><code><font color=#808080>(?&name)</font></code></td><td>recursive call to named group </td></tr>
|
||||
<tr><td><code><font color=#808080>(?P=name)</font></code></td><td>named backreference </td></tr>
|
||||
<tr><td><code><font color=#808080>(?P>name)</font></code></td><td>recursive call to named group </td></tr>
|
||||
<tr><td><code><font color=#808080>(?(cond)true|false)</font></code></td><td>conditional branch </td></tr>
|
||||
<tr><td><code><font color=#808080>(?(cond)true)</font></code></td><td>conditional branch </td></tr>
|
||||
<tr><td><code><font color=#808080>(*ACCEPT)</font></code></td><td>make regexps more like Prolog </td></tr>
|
||||
<tr><td><code><font color=#808080>(*COMMIT)</font></code></td><td></td></tr>
|
||||
<tr><td><code><font color=#808080>(*F)</font></code></td><td></td></tr>
|
||||
<tr><td><code><font color=#808080>(*FAIL)</font></code></td><td></td></tr>
|
||||
<tr><td><code><font color=#808080>(*MARK)</font></code></td><td></td></tr>
|
||||
<tr><td><code><font color=#808080>(*PRUNE)</font></code></td><td></td></tr>
|
||||
<tr><td><code><font color=#808080>(*SKIP)</font></code></td><td></td></tr>
|
||||
<tr><td><code><font color=#808080>(*THEN)</font></code></td><td></td></tr>
|
||||
<tr><td><code><font color=#808080>(*ANY)</font></code></td><td>set newline convention </td></tr>
|
||||
<tr><td><code><font color=#808080>(*ANYCRLF)</font></code></td><td></td></tr>
|
||||
<tr><td><code><font color=#808080>(*CR)</font></code></td><td></td></tr>
|
||||
<tr><td><code><font color=#808080>(*CRLF)</font></code></td><td></td></tr>
|
||||
<tr><td><code><font color=#808080>(*LF)</font></code></td><td></td></tr>
|
||||
<tr><td><code><font color=#808080>(*BSR_ANYCRLF)</font></code></td><td>set \R convention <font size=-2>PCRE</font></td></tr>
|
||||
<tr><td><code><font color=#808080>(*BSR_UNICODE)</font></code></td><td> <font size=-2>PCRE</font></td></tr>
|
||||
<tr><td></td></tr>
|
||||
</table>
|
||||
</body>
|
||||
</html>
|
399
third_party/re2/doc/syntax.txt
vendored
399
third_party/re2/doc/syntax.txt
vendored
@ -1,399 +0,0 @@
|
||||
RE2 regular expression syntax reference
|
||||
-------------------------------------
|
||||
|
||||
Single characters:
|
||||
. any character, possibly including newline (s=true)
|
||||
[xyz] character class
|
||||
[^xyz] negated character class
|
||||
\d Perl character class
|
||||
\D negated Perl character class
|
||||
[[:alpha:]] ASCII character class
|
||||
[[:^alpha:]] negated ASCII character class
|
||||
\pN Unicode character class (one-letter name)
|
||||
\p{Greek} Unicode character class
|
||||
\PN negated Unicode character class (one-letter name)
|
||||
\P{Greek} negated Unicode character class
|
||||
|
||||
Composites:
|
||||
xy «x» followed by «y»
|
||||
x|y «x» or «y» (prefer «x»)
|
||||
|
||||
Repetitions:
|
||||
x* zero or more «x», prefer more
|
||||
x+ one or more «x», prefer more
|
||||
x? zero or one «x», prefer one
|
||||
x{n,m} «n» or «n»+1 or ... or «m» «x», prefer more
|
||||
x{n,} «n» or more «x», prefer more
|
||||
x{n} exactly «n» «x»
|
||||
x*? zero or more «x», prefer fewer
|
||||
x+? one or more «x», prefer fewer
|
||||
x?? zero or one «x», prefer zero
|
||||
x{n,m}? «n» or «n»+1 or ... or «m» «x», prefer fewer
|
||||
x{n,}? «n» or more «x», prefer fewer
|
||||
x{n}? exactly «n» «x»
|
||||
x{} (== x*) NOT SUPPORTED vim
|
||||
x{-} (== x*?) NOT SUPPORTED vim
|
||||
x{-n} (== x{n}?) NOT SUPPORTED vim
|
||||
x= (== x?) NOT SUPPORTED vim
|
||||
|
||||
Implementation restriction: The counting forms «x{n,m}», «x{n,}», and «x{n}»
|
||||
reject forms that create a minimum or maximum repetition count above 1000.
|
||||
Unlimited repetitions are not subject to this restriction.
|
||||
|
||||
Possessive repetitions:
|
||||
x*+ zero or more «x», possessive NOT SUPPORTED
|
||||
x++ one or more «x», possessive NOT SUPPORTED
|
||||
x?+ zero or one «x», possessive NOT SUPPORTED
|
||||
x{n,m}+ «n» or ... or «m» «x», possessive NOT SUPPORTED
|
||||
x{n,}+ «n» or more «x», possessive NOT SUPPORTED
|
||||
x{n}+ exactly «n» «x», possessive NOT SUPPORTED
|
||||
|
||||
Grouping:
|
||||
(re) numbered capturing group (submatch)
|
||||
(?P<name>re) named & numbered capturing group (submatch)
|
||||
(?<name>re) named & numbered capturing group (submatch) NOT SUPPORTED
|
||||
(?'name're) named & numbered capturing group (submatch) NOT SUPPORTED
|
||||
(?:re) non-capturing group
|
||||
(?flags) set flags within current group; non-capturing
|
||||
(?flags:re) set flags during re; non-capturing
|
||||
(?#text) comment NOT SUPPORTED
|
||||
(?|x|y|z) branch numbering reset NOT SUPPORTED
|
||||
(?>re) possessive match of «re» NOT SUPPORTED
|
||||
re@> possessive match of «re» NOT SUPPORTED vim
|
||||
%(re) non-capturing group NOT SUPPORTED vim
|
||||
|
||||
Flags:
|
||||
i case-insensitive (default false)
|
||||
m multi-line mode: «^» and «$» match begin/end line in addition to begin/end text (default false)
|
||||
s let «.» match «\n» (default false)
|
||||
U ungreedy: swap meaning of «x*» and «x*?», «x+» and «x+?», etc (default false)
|
||||
Flag syntax is «xyz» (set) or «-xyz» (clear) or «xy-z» (set «xy», clear «z»).
|
||||
|
||||
Empty strings:
|
||||
^ at beginning of text or line («m»=true)
|
||||
$ at end of text (like «\z» not «\Z») or line («m»=true)
|
||||
\A at beginning of text
|
||||
\b at ASCII word boundary («\w» on one side and «\W», «\A», or «\z» on the other)
|
||||
\B not at ASCII word boundary
|
||||
\G at beginning of subtext being searched NOT SUPPORTED pcre
|
||||
\G at end of last match NOT SUPPORTED perl
|
||||
\Z at end of text, or before newline at end of text NOT SUPPORTED
|
||||
\z at end of text
|
||||
(?=re) before text matching «re» NOT SUPPORTED
|
||||
(?!re) before text not matching «re» NOT SUPPORTED
|
||||
(?<=re) after text matching «re» NOT SUPPORTED
|
||||
(?<!re) after text not matching «re» NOT SUPPORTED
|
||||
re& before text matching «re» NOT SUPPORTED vim
|
||||
re@= before text matching «re» NOT SUPPORTED vim
|
||||
re@! before text not matching «re» NOT SUPPORTED vim
|
||||
re@<= after text matching «re» NOT SUPPORTED vim
|
||||
re@<! after text not matching «re» NOT SUPPORTED vim
|
||||
\zs sets start of match (= \K) NOT SUPPORTED vim
|
||||
\ze sets end of match NOT SUPPORTED vim
|
||||
\%^ beginning of file NOT SUPPORTED vim
|
||||
\%$ end of file NOT SUPPORTED vim
|
||||
\%V on screen NOT SUPPORTED vim
|
||||
\%# cursor position NOT SUPPORTED vim
|
||||
\%'m mark «m» position NOT SUPPORTED vim
|
||||
\%23l in line 23 NOT SUPPORTED vim
|
||||
\%23c in column 23 NOT SUPPORTED vim
|
||||
\%23v in virtual column 23 NOT SUPPORTED vim
|
||||
|
||||
Escape sequences:
|
||||
\a bell (== \007)
|
||||
\f form feed (== \014)
|
||||
\t horizontal tab (== \011)
|
||||
\n newline (== \012)
|
||||
\r carriage return (== \015)
|
||||
\v vertical tab character (== \013)
|
||||
\* literal «*», for any punctuation character «*»
|
||||
\123 octal character code (up to three digits)
|
||||
\x7F hex character code (exactly two digits)
|
||||
\x{10FFFF} hex character code
|
||||
\C match a single byte even in UTF-8 mode
|
||||
\Q...\E literal text «...» even if «...» has punctuation
|
||||
|
||||
\1 backreference NOT SUPPORTED
|
||||
\b backspace NOT SUPPORTED (use «\010»)
|
||||
\cK control char ^K NOT SUPPORTED (use «\001» etc)
|
||||
\e escape NOT SUPPORTED (use «\033»)
|
||||
\g1 backreference NOT SUPPORTED
|
||||
\g{1} backreference NOT SUPPORTED
|
||||
\g{+1} backreference NOT SUPPORTED
|
||||
\g{-1} backreference NOT SUPPORTED
|
||||
\g{name} named backreference NOT SUPPORTED
|
||||
\g<name> subroutine call NOT SUPPORTED
|
||||
\g'name' subroutine call NOT SUPPORTED
|
||||
\k<name> named backreference NOT SUPPORTED
|
||||
\k'name' named backreference NOT SUPPORTED
|
||||
\lX lowercase «X» NOT SUPPORTED
|
||||
\ux uppercase «x» NOT SUPPORTED
|
||||
\L...\E lowercase text «...» NOT SUPPORTED
|
||||
\K reset beginning of «$0» NOT SUPPORTED
|
||||
\N{name} named Unicode character NOT SUPPORTED
|
||||
\R line break NOT SUPPORTED
|
||||
\U...\E upper case text «...» NOT SUPPORTED
|
||||
\X extended Unicode sequence NOT SUPPORTED
|
||||
|
||||
\%d123 decimal character 123 NOT SUPPORTED vim
|
||||
\%xFF hex character FF NOT SUPPORTED vim
|
||||
\%o123 octal character 123 NOT SUPPORTED vim
|
||||
\%u1234 Unicode character 0x1234 NOT SUPPORTED vim
|
||||
\%U12345678 Unicode character 0x12345678 NOT SUPPORTED vim
|
||||
|
||||
Character class elements:
|
||||
x single character
|
||||
A-Z character range (inclusive)
|
||||
\d Perl character class
|
||||
[:foo:] ASCII character class «foo»
|
||||
\p{Foo} Unicode character class «Foo»
|
||||
\pF Unicode character class «F» (one-letter name)
|
||||
|
||||
Named character classes as character class elements:
|
||||
[\d] digits (== \d)
|
||||
[^\d] not digits (== \D)
|
||||
[\D] not digits (== \D)
|
||||
[^\D] not not digits (== \d)
|
||||
[[:name:]] named ASCII class inside character class (== [:name:])
|
||||
[^[:name:]] named ASCII class inside negated character class (== [:^name:])
|
||||
[\p{Name}] named Unicode property inside character class (== \p{Name})
|
||||
[^\p{Name}] named Unicode property inside negated character class (== \P{Name})
|
||||
|
||||
Perl character classes (all ASCII-only):
|
||||
\d digits (== [0-9])
|
||||
\D not digits (== [^0-9])
|
||||
\s whitespace (== [\t\n\f\r ])
|
||||
\S not whitespace (== [^\t\n\f\r ])
|
||||
\w word characters (== [0-9A-Za-z_])
|
||||
\W not word characters (== [^0-9A-Za-z_])
|
||||
|
||||
\h horizontal space NOT SUPPORTED
|
||||
\H not horizontal space NOT SUPPORTED
|
||||
\v vertical space NOT SUPPORTED
|
||||
\V not vertical space NOT SUPPORTED
|
||||
|
||||
ASCII character classes:
|
||||
[[:alnum:]] alphanumeric (== [0-9A-Za-z])
|
||||
[[:alpha:]] alphabetic (== [A-Za-z])
|
||||
[[:ascii:]] ASCII (== [\x00-\x7F])
|
||||
[[:blank:]] blank (== [\t ])
|
||||
[[:cntrl:]] control (== [\x00-\x1F\x7F])
|
||||
[[:digit:]] digits (== [0-9])
|
||||
[[:graph:]] graphical (== [!-~] == [A-Za-z0-9!"#$%&'()*+,\-./:;<=>?@[\\\]^_`{|}~])
|
||||
[[:lower:]] lower case (== [a-z])
|
||||
[[:print:]] printable (== [ -~] == [ [:graph:]])
|
||||
[[:punct:]] punctuation (== [!-/:-@[-`{-~])
|
||||
[[:space:]] whitespace (== [\t\n\v\f\r ])
|
||||
[[:upper:]] upper case (== [A-Z])
|
||||
[[:word:]] word characters (== [0-9A-Za-z_])
|
||||
[[:xdigit:]] hex digit (== [0-9A-Fa-f])
|
||||
|
||||
Unicode character class names--general category:
|
||||
C other
|
||||
Cc control
|
||||
Cf format
|
||||
Cn unassigned code points NOT SUPPORTED
|
||||
Co private use
|
||||
Cs surrogate
|
||||
L letter
|
||||
LC cased letter NOT SUPPORTED
|
||||
L& cased letter NOT SUPPORTED
|
||||
Ll lowercase letter
|
||||
Lm modifier letter
|
||||
Lo other letter
|
||||
Lt titlecase letter
|
||||
Lu uppercase letter
|
||||
M mark
|
||||
Mc spacing mark
|
||||
Me enclosing mark
|
||||
Mn non-spacing mark
|
||||
N number
|
||||
Nd decimal number
|
||||
Nl letter number
|
||||
No other number
|
||||
P punctuation
|
||||
Pc connector punctuation
|
||||
Pd dash punctuation
|
||||
Pe close punctuation
|
||||
Pf final punctuation
|
||||
Pi initial punctuation
|
||||
Po other punctuation
|
||||
Ps open punctuation
|
||||
S symbol
|
||||
Sc currency symbol
|
||||
Sk modifier symbol
|
||||
Sm math symbol
|
||||
So other symbol
|
||||
Z separator
|
||||
Zl line separator
|
||||
Zp paragraph separator
|
||||
Zs space separator
|
||||
|
||||
Unicode character class names--scripts:
|
||||
Arabic Arabic
|
||||
Armenian Armenian
|
||||
Balinese Balinese
|
||||
Bamum Bamum
|
||||
Batak Batak
|
||||
Bengali Bengali
|
||||
Bopomofo Bopomofo
|
||||
Brahmi Brahmi
|
||||
Braille Braille
|
||||
Buginese Buginese
|
||||
Buhid Buhid
|
||||
Canadian_Aboriginal Canadian Aboriginal
|
||||
Carian Carian
|
||||
Chakma Chakma
|
||||
Cham Cham
|
||||
Cherokee Cherokee
|
||||
Common characters not specific to one script
|
||||
Coptic Coptic
|
||||
Cuneiform Cuneiform
|
||||
Cypriot Cypriot
|
||||
Cyrillic Cyrillic
|
||||
Deseret Deseret
|
||||
Devanagari Devanagari
|
||||
Egyptian_Hieroglyphs Egyptian Hieroglyphs
|
||||
Ethiopic Ethiopic
|
||||
Georgian Georgian
|
||||
Glagolitic Glagolitic
|
||||
Gothic Gothic
|
||||
Greek Greek
|
||||
Gujarati Gujarati
|
||||
Gurmukhi Gurmukhi
|
||||
Han Han
|
||||
Hangul Hangul
|
||||
Hanunoo Hanunoo
|
||||
Hebrew Hebrew
|
||||
Hiragana Hiragana
|
||||
Imperial_Aramaic Imperial Aramaic
|
||||
Inherited inherit script from previous character
|
||||
Inscriptional_Pahlavi Inscriptional Pahlavi
|
||||
Inscriptional_Parthian Inscriptional Parthian
|
||||
Javanese Javanese
|
||||
Kaithi Kaithi
|
||||
Kannada Kannada
|
||||
Katakana Katakana
|
||||
Kayah_Li Kayah Li
|
||||
Kharoshthi Kharoshthi
|
||||
Khmer Khmer
|
||||
Lao Lao
|
||||
Latin Latin
|
||||
Lepcha Lepcha
|
||||
Limbu Limbu
|
||||
Linear_B Linear B
|
||||
Lycian Lycian
|
||||
Lydian Lydian
|
||||
Malayalam Malayalam
|
||||
Mandaic Mandaic
|
||||
Meetei_Mayek Meetei Mayek
|
||||
Meroitic_Cursive Meroitic Cursive
|
||||
Meroitic_Hieroglyphs Meroitic Hieroglyphs
|
||||
Miao Miao
|
||||
Mongolian Mongolian
|
||||
Myanmar Myanmar
|
||||
New_Tai_Lue New Tai Lue (aka Simplified Tai Lue)
|
||||
Nko Nko
|
||||
Ogham Ogham
|
||||
Ol_Chiki Ol Chiki
|
||||
Old_Italic Old Italic
|
||||
Old_Persian Old Persian
|
||||
Old_South_Arabian Old South Arabian
|
||||
Old_Turkic Old Turkic
|
||||
Oriya Oriya
|
||||
Osmanya Osmanya
|
||||
Phags_Pa 'Phags Pa
|
||||
Phoenician Phoenician
|
||||
Rejang Rejang
|
||||
Runic Runic
|
||||
Saurashtra Saurashtra
|
||||
Sharada Sharada
|
||||
Shavian Shavian
|
||||
Sinhala Sinhala
|
||||
Sora_Sompeng Sora Sompeng
|
||||
Sundanese Sundanese
|
||||
Syloti_Nagri Syloti Nagri
|
||||
Syriac Syriac
|
||||
Tagalog Tagalog
|
||||
Tagbanwa Tagbanwa
|
||||
Tai_Le Tai Le
|
||||
Tai_Tham Tai Tham
|
||||
Tai_Viet Tai Viet
|
||||
Takri Takri
|
||||
Tamil Tamil
|
||||
Telugu Telugu
|
||||
Thaana Thaana
|
||||
Thai Thai
|
||||
Tibetan Tibetan
|
||||
Tifinagh Tifinagh
|
||||
Ugaritic Ugaritic
|
||||
Vai Vai
|
||||
Yi Yi
|
||||
|
||||
Vim character classes:
|
||||
\i identifier character NOT SUPPORTED vim
|
||||
\I «\i» except digits NOT SUPPORTED vim
|
||||
\k keyword character NOT SUPPORTED vim
|
||||
\K «\k» except digits NOT SUPPORTED vim
|
||||
\f file name character NOT SUPPORTED vim
|
||||
\F «\f» except digits NOT SUPPORTED vim
|
||||
\p printable character NOT SUPPORTED vim
|
||||
\P «\p» except digits NOT SUPPORTED vim
|
||||
\s whitespace character (== [ \t]) NOT SUPPORTED vim
|
||||
\S non-white space character (== [^ \t]) NOT SUPPORTED vim
|
||||
\d digits (== [0-9]) vim
|
||||
\D not «\d» vim
|
||||
\x hex digits (== [0-9A-Fa-f]) NOT SUPPORTED vim
|
||||
\X not «\x» NOT SUPPORTED vim
|
||||
\o octal digits (== [0-7]) NOT SUPPORTED vim
|
||||
\O not «\o» NOT SUPPORTED vim
|
||||
\w word character vim
|
||||
\W not «\w» vim
|
||||
\h head of word character NOT SUPPORTED vim
|
||||
\H not «\h» NOT SUPPORTED vim
|
||||
\a alphabetic NOT SUPPORTED vim
|
||||
\A not «\a» NOT SUPPORTED vim
|
||||
\l lowercase NOT SUPPORTED vim
|
||||
\L not lowercase NOT SUPPORTED vim
|
||||
\u uppercase NOT SUPPORTED vim
|
||||
\U not uppercase NOT SUPPORTED vim
|
||||
\_x «\x» plus newline, for any «x» NOT SUPPORTED vim
|
||||
|
||||
Vim flags:
|
||||
\c ignore case NOT SUPPORTED vim
|
||||
\C match case NOT SUPPORTED vim
|
||||
\m magic NOT SUPPORTED vim
|
||||
\M nomagic NOT SUPPORTED vim
|
||||
\v verymagic NOT SUPPORTED vim
|
||||
\V verynomagic NOT SUPPORTED vim
|
||||
\Z ignore differences in Unicode combining characters NOT SUPPORTED vim
|
||||
|
||||
Magic:
|
||||
(?{code}) arbitrary Perl code NOT SUPPORTED perl
|
||||
(??{code}) postponed arbitrary Perl code NOT SUPPORTED perl
|
||||
(?n) recursive call to regexp capturing group «n» NOT SUPPORTED
|
||||
(?+n) recursive call to relative group «+n» NOT SUPPORTED
|
||||
(?-n) recursive call to relative group «-n» NOT SUPPORTED
|
||||
(?C) PCRE callout NOT SUPPORTED pcre
|
||||
(?R) recursive call to entire regexp (== (?0)) NOT SUPPORTED
|
||||
(?&name) recursive call to named group NOT SUPPORTED
|
||||
(?P=name) named backreference NOT SUPPORTED
|
||||
(?P>name) recursive call to named group NOT SUPPORTED
|
||||
(?(cond)true|false) conditional branch NOT SUPPORTED
|
||||
(?(cond)true) conditional branch NOT SUPPORTED
|
||||
(*ACCEPT) make regexps more like Prolog NOT SUPPORTED
|
||||
(*COMMIT) NOT SUPPORTED
|
||||
(*F) NOT SUPPORTED
|
||||
(*FAIL) NOT SUPPORTED
|
||||
(*MARK) NOT SUPPORTED
|
||||
(*PRUNE) NOT SUPPORTED
|
||||
(*SKIP) NOT SUPPORTED
|
||||
(*THEN) NOT SUPPORTED
|
||||
(*ANY) set newline convention NOT SUPPORTED
|
||||
(*ANYCRLF) NOT SUPPORTED
|
||||
(*CR) NOT SUPPORTED
|
||||
(*CRLF) NOT SUPPORTED
|
||||
(*LF) NOT SUPPORTED
|
||||
(*BSR_ANYCRLF) set \R convention NOT SUPPORTED pcre
|
||||
(*BSR_UNICODE) NOT SUPPORTED pcre
|
||||
|
BIN
third_party/re2/doc/xkcd.png
vendored
BIN
third_party/re2/doc/xkcd.png
vendored
Binary file not shown.
Before ![]() (image error) Size: 26 KiB |
104
third_party/re2/lib/git/commit-msg.hook
vendored
104
third_party/re2/lib/git/commit-msg.hook
vendored
@ -1,104 +0,0 @@
|
||||
#!/bin/sh
|
||||
# From Gerrit Code Review 2.2.1
|
||||
#
|
||||
# Part of Gerrit Code Review (http://code.google.com/p/gerrit/)
|
||||
#
|
||||
# Copyright (C) 2009 The Android Open Source Project
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
|
||||
# Footer key names, written as a regex alternation. add_ChangeId hands this to
# its perl script, which inserts the Change-Id line after any leading footer
# lines whose key matches one of these names (case-insensitively).
CHANGE_ID_AFTER="Bug|Issue"
|
||||
# First positional argument of the script; used below as the path of the
# commit message file that is read and rewritten.
MSG="$1"
|
||||
|
||||
# Check for, and add if missing, a unique Change-Id
|
||||
#
# Reads the commit message file named by $MSG. Returns without touching the
# file when the cleaned message is empty or when a Change-Id line is already
# present; otherwise rewrites the file with a "Change-Id: I<id>" footer line,
# where <id> is the value printed by _gen_ChangeId.
|
||||
add_ChangeId() {
|
||||
# clean_message = the message cut off at the first "diff --git a/" line, with
# Signed-off-by lines and #-comment lines deleted, then passed through
# git stripspace. _gen_ChangeIdInput reads this variable.
clean_message=`sed -e '
|
||||
/^diff --git a\/.*/{
|
||||
s///
|
||||
q
|
||||
}
|
||||
/^Signed-off-by:/d
|
||||
/^#/d
|
||||
' "$MSG" | git stripspace`
|
||||
# Nothing left after cleaning: leave the message file as it is.
if test -z "$clean_message"
|
||||
then
|
||||
return
|
||||
fi
|
||||
|
||||
# A Change-Id line (matched case-insensitively) already exists: nothing to add.
if grep -i '^Change-Id:' "$MSG" >/dev/null
|
||||
then
|
||||
return
|
||||
fi
|
||||
|
||||
id=`_gen_ChangeId`
|
||||
# Rewrite $MSG in place. The perl script drops everything from the first
# "diff --git a/" line onward and blanks #-comment lines, scans backwards to
# find where the trailing block of "Key: value" footer lines starts, skips any
# leading footer lines whose key is listed in CHANGE_ID_AFTER, and splices
# "Change-Id: I$id" in at that position before writing the file back.
perl -e '
|
||||
$MSG = shift;
|
||||
$id = shift;
|
||||
$CHANGE_ID_AFTER = shift;
|
||||
|
||||
undef $/;
|
||||
open(I, $MSG); $_ = <I>; close I;
|
||||
s|^diff --git a/.*||ms;
|
||||
s|^#.*$||mg;
|
||||
exit unless $_;
|
||||
|
||||
@message = split /\n/;
|
||||
$haveFooter = 0;
|
||||
$startFooter = @message;
|
||||
for($line = @message - 1; $line >= 0; $line--) {
|
||||
$_ = $message[$line];
|
||||
|
||||
if (/^[a-zA-Z0-9-]+:/ && !m,^[a-z0-9-]+://,) {
|
||||
$haveFooter++;
|
||||
next;
|
||||
}
|
||||
next if /^[ []/;
|
||||
$startFooter = $line if ($haveFooter && /^\r?$/);
|
||||
last;
|
||||
}
|
||||
|
||||
@footer = @message[$startFooter+1..@message];
|
||||
@message = @message[0..$startFooter];
|
||||
push(@footer, "") unless @footer;
|
||||
|
||||
for ($line = 0; $line < @footer; $line++) {
|
||||
$_ = $footer[$line];
|
||||
next if /^($CHANGE_ID_AFTER):/i;
|
||||
last;
|
||||
}
|
||||
splice(@footer, $line, 0, "Change-Id: I$id");
|
||||
|
||||
$_ = join("\n", @message, @footer);
|
||||
open(O, ">$MSG"); print O; close O;
|
||||
' "$MSG" "$id" "$CHANGE_ID_AFTER"
|
||||
}
|
||||
# Writes to stdout the text that _gen_ChangeId hashes: a "tree" line, a
# "parent" line when HEAD resolves, "author" and "committer" lines, one blank
# line, and finally $clean_message without a trailing newline.
_gen_ChangeIdInput() {
	echo "tree $(git write-tree)"
	# No "parent" line is emitted when HEAD cannot be resolved.
	if parent=$(git rev-parse HEAD^0 2>/dev/null); then
		echo "parent $parent"
	fi
	echo "author $(git var GIT_AUTHOR_IDENT)"
	echo "committer $(git var GIT_COMMITTER_IDENT)"
	echo
	printf '%s' "$clean_message"
}
|
||||
# Prints the object id that git hash-object computes, as a commit object,
# for the text produced by _gen_ChangeIdInput.
_gen_ChangeId() {
	_gen_ChangeIdInput | git hash-object -t commit --stdin
}
|
||||
|
||||
|
||||
# Script entry point: process the message file named in $MSG.
add_ChangeId
|
18
third_party/re2/libre2.symbols
vendored
18
third_party/re2/libre2.symbols
vendored
@ -1,18 +0,0 @@
|
||||
{
|
||||
global:
|
||||
# re2::RE2*
|
||||
_ZN3re23RE2*;
|
||||
_ZNK3re23RE2*;
|
||||
# re2::StringPiece*
|
||||
_ZN3re211StringPiece*;
|
||||
_ZNK3re211StringPiece*;
|
||||
# operator<<(std::ostream&, re2::StringPiece const&)
|
||||
_ZlsRSoRKN3re211StringPieceE;
|
||||
# re2::FilteredRE2*
|
||||
_ZN3re211FilteredRE2*;
|
||||
_ZNK3re211FilteredRE2*;
|
||||
# flags
|
||||
_ZN3re2*FLAGS_*;
|
||||
local:
|
||||
*;
|
||||
};
|
19
third_party/re2/libre2.symbols.darwin
vendored
19
third_party/re2/libre2.symbols.darwin
vendored
@ -1,19 +0,0 @@
|
||||
# Linker doesn't like these unmangled:
|
||||
# re2::RE2*
|
||||
__ZN3re23RE2*
|
||||
__ZNK3re23RE2*
|
||||
# re2::StringPiece*
|
||||
__ZN3re211StringPiece*
|
||||
__ZNK3re211StringPiece*
|
||||
# operator<<(std::ostream&, re2::StringPiece const&)
|
||||
# Seen with libstdc++ on 10.8 and below:
|
||||
# __ZlsRSoRKN3re211StringPieceE
|
||||
# Seen with libc++ on 10.9 and above:
|
||||
# __ZlsRNSt3__113basic_ostreamIcNS_11char_traitsIcEEEERKN3re211StringPieceE
|
||||
# Note that "ls" means operator<<, so this is not overly broad.
|
||||
__Zls*RKN3re211StringPieceE
|
||||
# re2::FilteredRE2*
|
||||
__ZN3re211FilteredRE2*
|
||||
__ZNK3re211FilteredRE2*
|
||||
# flags
|
||||
__ZN3re2*FLAGS_*
|
98
third_party/re2/re2.gyp
vendored
98
third_party/re2/re2.gyp
vendored
@ -11,12 +11,12 @@
|
||||
'target_name': 're2',
|
||||
'type': 'static_library',
|
||||
'include_dirs': [
|
||||
'.',
|
||||
'src',
|
||||
'<(DEPTH)',
|
||||
],
|
||||
'direct_dependent_settings': {
|
||||
'include_dirs': [
|
||||
'.',
|
||||
'src',
|
||||
'<(DEPTH)',
|
||||
],
|
||||
},
|
||||
@ -24,53 +24,53 @@
|
||||
'<(DEPTH)/base/third_party/dynamic_annotations/dynamic_annotations.gyp:dynamic_annotations',
|
||||
],
|
||||
'sources': [
|
||||
're2/bitstate.cc',
|
||||
're2/compile.cc',
|
||||
're2/dfa.cc',
|
||||
're2/filtered_re2.cc',
|
||||
're2/filtered_re2.h',
|
||||
're2/mimics_pcre.cc',
|
||||
're2/nfa.cc',
|
||||
're2/onepass.cc',
|
||||
're2/parse.cc',
|
||||
're2/perl_groups.cc',
|
||||
're2/prefilter.cc',
|
||||
're2/prefilter.h',
|
||||
're2/prefilter_tree.cc',
|
||||
're2/prefilter_tree.h',
|
||||
're2/prog.cc',
|
||||
're2/prog.h',
|
||||
're2/re2.cc',
|
||||
're2/re2.h',
|
||||
're2/regexp.cc',
|
||||
're2/regexp.h',
|
||||
're2/set.cc',
|
||||
're2/set.h',
|
||||
're2/simplify.cc',
|
||||
're2/stringpiece.cc',
|
||||
're2/stringpiece.h',
|
||||
're2/tostring.cc',
|
||||
're2/unicode_casefold.cc',
|
||||
're2/unicode_casefold.h',
|
||||
're2/unicode_groups.cc',
|
||||
're2/unicode_groups.h',
|
||||
're2/variadic_function.h',
|
||||
're2/walker-inl.h',
|
||||
'util/atomicops.h',
|
||||
'util/flags.h',
|
||||
'util/hash.cc',
|
||||
'util/logging.cc',
|
||||
'util/logging.h',
|
||||
'util/mutex.h',
|
||||
'util/rune.cc',
|
||||
'util/sparse_array.h',
|
||||
'util/sparse_set.h',
|
||||
'util/stringprintf.cc',
|
||||
'util/strutil.cc',
|
||||
'util/utf.h',
|
||||
'util/util.h',
|
||||
'util/valgrind.cc',
|
||||
'util/valgrind.h',
|
||||
'src/re2/bitstate.cc',
|
||||
'src/re2/compile.cc',
|
||||
'src/re2/dfa.cc',
|
||||
'src/re2/filtered_re2.cc',
|
||||
'src/re2/filtered_re2.h',
|
||||
'src/re2/mimics_pcre.cc',
|
||||
'src/re2/nfa.cc',
|
||||
'src/re2/onepass.cc',
|
||||
'src/re2/parse.cc',
|
||||
'src/re2/perl_groups.cc',
|
||||
'src/re2/prefilter.cc',
|
||||
'src/re2/prefilter.h',
|
||||
'src/re2/prefilter_tree.cc',
|
||||
'src/re2/prefilter_tree.h',
|
||||
'src/re2/prog.cc',
|
||||
'src/re2/prog.h',
|
||||
'src/re2/re2.cc',
|
||||
'src/re2/re2.h',
|
||||
'src/re2/regexp.cc',
|
||||
'src/re2/regexp.h',
|
||||
'src/re2/set.cc',
|
||||
'src/re2/set.h',
|
||||
'src/re2/simplify.cc',
|
||||
'src/re2/stringpiece.cc',
|
||||
'src/re2/stringpiece.h',
|
||||
'src/re2/tostring.cc',
|
||||
'src/re2/unicode_casefold.cc',
|
||||
'src/re2/unicode_casefold.h',
|
||||
'src/re2/unicode_groups.cc',
|
||||
'src/re2/unicode_groups.h',
|
||||
'src/re2/variadic_function.h',
|
||||
'src/re2/walker-inl.h',
|
||||
'src/util/atomicops.h',
|
||||
'src/util/flags.h',
|
||||
'src/util/hash.cc',
|
||||
'src/util/logging.cc',
|
||||
'src/util/logging.h',
|
||||
'src/util/mutex.h',
|
||||
'src/util/rune.cc',
|
||||
'src/util/sparse_array.h',
|
||||
'src/util/sparse_set.h',
|
||||
'src/util/stringprintf.cc',
|
||||
'src/util/strutil.cc',
|
||||
'src/util/utf.h',
|
||||
'src/util/util.h',
|
||||
'src/util/valgrind.cc',
|
||||
'src/util/valgrind.h',
|
||||
],
|
||||
'conditions': [
|
||||
['OS=="win"', {
|
||||
|
10
third_party/re2/re2.pc
vendored
10
third_party/re2/re2.pc
vendored
@ -1,10 +0,0 @@
|
||||
prefix=@prefix@
|
||||
exec_prefix=${prefix}
|
||||
includedir=${prefix}/include
|
||||
libdir=${exec_prefix}/lib
|
||||
|
||||
Name: re2
|
||||
Description: RE2 is a fast, safe, thread-friendly regular expression engine.
|
||||
Version: 0.0.0
|
||||
Cflags: -I${includedir}
|
||||
Libs: -L${libdir} -lre2 -pthread
|
381
third_party/re2/re2/bitstate.cc
vendored
381
third_party/re2/re2/bitstate.cc
vendored
@ -1,381 +0,0 @@
|
||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Tested by search_test.cc, exhaustive_test.cc, tester.cc
|
||||
|
||||
// Prog::SearchBitState is a regular expression search with submatch
|
||||
// tracking for small regular expressions and texts. Like
|
||||
// testing/backtrack.cc, it allocates a bit vector with (length of
|
||||
// text) * (length of prog) bits, to make sure it never explores the
|
||||
// same (character position, instruction) state multiple times. This
|
||||
// limits the search to run in time linear in the length of the text.
|
||||
//
|
||||
// Unlike testing/backtrack.cc, SearchBitState is not recursive
|
||||
// on the text.
|
||||
//
|
||||
// SearchBitState is a fast replacement for the NFA code on small
|
||||
// regexps and texts when SearchOnePass cannot be used.
|
||||
|
||||
#include "re2/prog.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// One entry of BitState's explicit backtracking stack.
struct Job {
  int id;         // index of the instruction to run
  int arg;        // 0 for a fresh visit; nonzero when resuming an earlier one
  const char* p;  // text position (or a saved capture pointer when resuming)
};
|
||||
|
||||
class BitState {
|
||||
public:
|
||||
explicit BitState(Prog* prog);
|
||||
~BitState();
|
||||
|
||||
// The usual Search prototype.
|
||||
// Can only call Search once per BitState.
|
||||
bool Search(const StringPiece& text, const StringPiece& context,
|
||||
bool anchored, bool longest,
|
||||
StringPiece* submatch, int nsubmatch);
|
||||
|
||||
private:
|
||||
inline bool ShouldVisit(int id, const char* p);
|
||||
void Push(int id, const char* p, int arg);
|
||||
bool GrowStack();
|
||||
bool TrySearch(int id, const char* p);
|
||||
|
||||
// Search parameters
|
||||
Prog* prog_; // program being run
|
||||
StringPiece text_; // text being searched
|
||||
StringPiece context_; // greater context of text being searched
|
||||
bool anchored_; // whether search is anchored at text.begin()
|
||||
bool longest_; // whether search wants leftmost-longest match
|
||||
bool endmatch_; // whether match must end at text.end()
|
||||
StringPiece *submatch_; // submatches to fill in
|
||||
int nsubmatch_; // # of submatches to fill in
|
||||
|
||||
// Search state
|
||||
const char** cap_; // capture registers
|
||||
int ncap_;
|
||||
|
||||
static const int VisitedBits = 32;
|
||||
uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked
|
||||
int nvisited_; // # of words in bitmap
|
||||
|
||||
Job *job_; // stack of text positions to explore
|
||||
int njob_;
|
||||
int maxjob_;
|
||||
};
|
||||
|
||||
// Creates an idle BitState bound to |prog|.  All buffers start out NULL and
// all counters zero; Search() allocates and fills them.
BitState::BitState(Prog* prog)
    : prog_(prog), anchored_(false), longest_(false), endmatch_(false),
      submatch_(NULL), nsubmatch_(0),
      cap_(NULL), ncap_(0),
      visited_(NULL), nvisited_(0),
      job_(NULL), njob_(0), maxjob_(0) {}
|
||||
|
||||
// Frees the three scratch arrays.  Each pointer may still be NULL if
// Search() never ran; delete[] on NULL does nothing.
BitState::~BitState() {
  delete[] cap_;
  delete[] job_;
  delete[] visited_;
}
|
||||
|
||||
// Should the search visit the pair ip, p?
|
||||
// If so, remember that it was visited so that the next time,
|
||||
// we don't repeat the visit.
|
||||
bool BitState::ShouldVisit(int id, const char* p) {
|
||||
size_t n = id * (text_.size() + 1) + (p - text_.begin());
|
||||
if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1))))
|
||||
return false;
|
||||
visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1));
|
||||
return true;
|
||||
}
|
||||
|
||||
// Grow the stack.
|
||||
bool BitState::GrowStack() {
|
||||
// VLOG(0) << "Reallocate.";
|
||||
maxjob_ *= 2;
|
||||
Job* newjob = new Job[maxjob_];
|
||||
memmove(newjob, job_, njob_*sizeof job_[0]);
|
||||
delete[] job_;
|
||||
job_ = newjob;
|
||||
if (njob_ >= maxjob_) {
|
||||
LOG(DFATAL) << "Job stack overflow.";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Push the triple (id, p, arg) onto the stack, growing it if necessary.
|
||||
void BitState::Push(int id, const char* p, int arg) {
|
||||
if (njob_ >= maxjob_) {
|
||||
if (!GrowStack())
|
||||
return;
|
||||
}
|
||||
int op = prog_->inst(id)->opcode();
|
||||
if (op == kInstFail)
|
||||
return;
|
||||
|
||||
// Only check ShouldVisit when arg == 0.
|
||||
// When arg > 0, we are continuing a previous visit.
|
||||
if (arg == 0 && !ShouldVisit(id, p))
|
||||
return;
|
||||
|
||||
Job* j = &job_[njob_++];
|
||||
j->id = id;
|
||||
j->p = p;
|
||||
j->arg = arg;
|
||||
}
|
||||
|
||||
// Try a search from instruction id0 in state p0.
|
||||
// Return whether it succeeded.
|
||||
bool BitState::TrySearch(int id0, const char* p0) {
|
||||
bool matched = false;
|
||||
const char* end = text_.end();
|
||||
njob_ = 0;
|
||||
Push(id0, p0, 0);
|
||||
while (njob_ > 0) {
|
||||
// Pop job off stack.
|
||||
--njob_;
|
||||
int id = job_[njob_].id;
|
||||
const char* p = job_[njob_].p;
|
||||
int arg = job_[njob_].arg;
|
||||
|
||||
// Optimization: rather than push and pop,
|
||||
// code that is going to Push and continue
|
||||
// the loop simply updates ip, p, and arg
|
||||
// and jumps to CheckAndLoop. We have to
|
||||
// do the ShouldVisit check that Push
|
||||
// would have, but we avoid the stack
|
||||
// manipulation.
|
||||
if (0) {
|
||||
CheckAndLoop:
|
||||
if (!ShouldVisit(id, p))
|
||||
continue;
|
||||
}
|
||||
|
||||
// Visit ip, p.
|
||||
// VLOG(0) << "Job: " << ip->id() << " "
|
||||
// << (p - text_.begin()) << " " << arg;
|
||||
Prog::Inst* ip = prog_->inst(id);
|
||||
switch (ip->opcode()) {
|
||||
case kInstFail:
|
||||
return false;
|
||||
|
||||
default:
|
||||
LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg;
|
||||
return false;
|
||||
|
||||
case kInstAlt:
|
||||
// Cannot just
|
||||
// Push(ip->out1(), p, 0);
|
||||
// Push(ip->out(), p, 0);
|
||||
// If, during the processing of ip->out(), we encounter
|
||||
// ip->out1() via another path, we want to process it then.
|
||||
// Pushing it here will inhibit that. Instead, re-push
|
||||
// ip with arg==1 as a reminder to push ip->out1() later.
|
||||
switch (arg) {
|
||||
case 0:
|
||||
Push(id, p, 1); // come back when we're done
|
||||
id = ip->out();
|
||||
goto CheckAndLoop;
|
||||
|
||||
case 1:
|
||||
// Finished ip->out(); try ip->out1().
|
||||
arg = 0;
|
||||
id = ip->out1();
|
||||
goto CheckAndLoop;
|
||||
}
|
||||
LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
|
||||
continue;
|
||||
|
||||
case kInstAltMatch:
|
||||
// One opcode is byte range; the other leads to match.
|
||||
if (ip->greedy(prog_)) {
|
||||
// out1 is the match
|
||||
Push(ip->out1(), p, 0);
|
||||
id = ip->out1();
|
||||
p = end;
|
||||
goto CheckAndLoop;
|
||||
}
|
||||
// out is the match - non-greedy
|
||||
Push(ip->out(), end, 0);
|
||||
id = ip->out();
|
||||
goto CheckAndLoop;
|
||||
|
||||
case kInstByteRange: {
|
||||
int c = -1;
|
||||
if (p < end)
|
||||
c = *p & 0xFF;
|
||||
if (ip->Matches(c)) {
|
||||
id = ip->out();
|
||||
p++;
|
||||
goto CheckAndLoop;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
case kInstCapture:
|
||||
switch (arg) {
|
||||
case 0:
|
||||
if (0 <= ip->cap() && ip->cap() < ncap_) {
|
||||
// Capture p to register, but save old value.
|
||||
Push(id, cap_[ip->cap()], 1); // come back when we're done
|
||||
cap_[ip->cap()] = p;
|
||||
}
|
||||
// Continue on.
|
||||
id = ip->out();
|
||||
goto CheckAndLoop;
|
||||
case 1:
|
||||
// Finished ip->out(); restore the old value.
|
||||
cap_[ip->cap()] = p;
|
||||
continue;
|
||||
}
|
||||
LOG(DFATAL) << "Bad arg in kInstCapture: " << arg;
|
||||
continue;
|
||||
|
||||
case kInstEmptyWidth:
|
||||
if (ip->empty() & ~Prog::EmptyFlags(context_, p))
|
||||
continue;
|
||||
id = ip->out();
|
||||
goto CheckAndLoop;
|
||||
|
||||
case kInstNop:
|
||||
id = ip->out();
|
||||
goto CheckAndLoop;
|
||||
|
||||
case kInstMatch: {
|
||||
if (endmatch_ && p != text_.end())
|
||||
continue;
|
||||
|
||||
// VLOG(0) << "Found match.";
|
||||
// We found a match. If the caller doesn't care
|
||||
// where the match is, no point going further.
|
||||
if (nsubmatch_ == 0)
|
||||
return true;
|
||||
|
||||
// Record best match so far.
|
||||
// Only need to check end point, because this entire
|
||||
// call is only considering one start position.
|
||||
matched = true;
|
||||
cap_[1] = p;
|
||||
if (submatch_[0].data() == NULL ||
|
||||
(longest_ && p > submatch_[0].end())) {
|
||||
for (int i = 0; i < nsubmatch_; i++)
|
||||
submatch_[i].set(cap_[2*i],
|
||||
static_cast<int>(cap_[2*i+1] - cap_[2*i]));
|
||||
}
|
||||
|
||||
// If going for first match, we're done.
|
||||
if (!longest_)
|
||||
return true;
|
||||
|
||||
// If we used the entire text, no longer match is possible.
|
||||
if (p == text_.end())
|
||||
return true;
|
||||
|
||||
// Otherwise, continue on in hope of a longer match.
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
return matched;
|
||||
}
|
||||
|
||||
// Search text (within context) for prog_.
|
||||
bool BitState::Search(const StringPiece& text, const StringPiece& context,
|
||||
bool anchored, bool longest,
|
||||
StringPiece* submatch, int nsubmatch) {
|
||||
// Search parameters.
|
||||
text_ = text;
|
||||
context_ = context;
|
||||
if (context_.begin() == NULL)
|
||||
context_ = text;
|
||||
if (prog_->anchor_start() && context_.begin() != text.begin())
|
||||
return false;
|
||||
if (prog_->anchor_end() && context_.end() != text.end())
|
||||
return false;
|
||||
anchored_ = anchored || prog_->anchor_start();
|
||||
longest_ = longest || prog_->anchor_end();
|
||||
endmatch_ = prog_->anchor_end();
|
||||
submatch_ = submatch;
|
||||
nsubmatch_ = nsubmatch;
|
||||
for (int i = 0; i < nsubmatch_; i++)
|
||||
submatch_[i] = NULL;
|
||||
|
||||
// Allocate scratch space.
|
||||
nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits;
|
||||
visited_ = new uint32[nvisited_];
|
||||
memset(visited_, 0, nvisited_*sizeof visited_[0]);
|
||||
// VLOG(0) << "nvisited_ = " << nvisited_;
|
||||
|
||||
ncap_ = 2*nsubmatch;
|
||||
if (ncap_ < 2)
|
||||
ncap_ = 2;
|
||||
cap_ = new const char*[ncap_];
|
||||
memset(cap_, 0, ncap_*sizeof cap_[0]);
|
||||
|
||||
maxjob_ = 256;
|
||||
job_ = new Job[maxjob_];
|
||||
|
||||
// Anchored search must start at text.begin().
|
||||
if (anchored_) {
|
||||
cap_[0] = text.begin();
|
||||
return TrySearch(prog_->start(), text.begin());
|
||||
}
|
||||
|
||||
// Unanchored search, starting from each possible text position.
|
||||
// Notice that we have to try the empty string at the end of
|
||||
// the text, so the loop condition is p <= text.end(), not p < text.end().
|
||||
// This looks like it's quadratic in the size of the text,
|
||||
// but we are not clearing visited_ between calls to TrySearch,
|
||||
// so no work is duplicated and it ends up still being linear.
|
||||
for (const char* p = text.begin(); p <= text.end(); p++) {
|
||||
cap_[0] = p;
|
||||
if (TrySearch(prog_->start(), p)) // Match must be leftmost; done.
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Bit-state search.
|
||||
bool Prog::SearchBitState(const StringPiece& text,
|
||||
const StringPiece& context,
|
||||
Anchor anchor,
|
||||
MatchKind kind,
|
||||
StringPiece* match,
|
||||
int nmatch) {
|
||||
// If full match, we ask for an anchored longest match
|
||||
// and then check that match[0] == text.
|
||||
// So make sure match[0] exists.
|
||||
StringPiece sp0;
|
||||
if (kind == kFullMatch) {
|
||||
anchor = kAnchored;
|
||||
if (nmatch < 1) {
|
||||
match = &sp0;
|
||||
nmatch = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Run the search.
|
||||
BitState b(this);
|
||||
bool anchored = anchor == kAnchored;
|
||||
bool longest = kind != kFirstMatch;
|
||||
if (!b.Search(text, context, anchored, longest, match, nmatch))
|
||||
return false;
|
||||
if (kind == kFullMatch && match[0].end() != text.end())
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace re2
|
1151
third_party/re2/re2/compile.cc
vendored
1151
third_party/re2/re2/compile.cc
vendored
File diff suppressed because it is too large
Load Diff
2112
third_party/re2/re2/dfa.cc
vendored
2112
third_party/re2/re2/dfa.cc
vendored
File diff suppressed because it is too large
Load Diff
107
third_party/re2/re2/filtered_re2.cc
vendored
107
third_party/re2/re2/filtered_re2.cc
vendored
@ -1,107 +0,0 @@
|
||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include <string>
|
||||
#include "util/util.h"
|
||||
#include "re2/filtered_re2.h"
|
||||
#include "re2/prefilter.h"
|
||||
#include "re2/prefilter_tree.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Starts out uncompiled, with no regexps and a freshly allocated
// (empty) prefilter tree.
FilteredRE2::FilteredRE2()
    : compiled_(false), prefilter_tree_(new PrefilterTree()) {}
|
||||
|
||||
// Deletes every RE2 stored in re2_vec_, then the prefilter tree.
FilteredRE2::~FilteredRE2() {
  for (vector<RE2*>::iterator it = re2_vec_.begin();
       it != re2_vec_.end(); ++it) {
    delete *it;
  }
  delete prefilter_tree_;
}
|
||||
|
||||
// Compiles pattern with options into a new RE2 and returns its error code.
//
// On success the RE2 is appended to re2_vec_ and *id is set to its index.
// On failure the RE2 is deleted, *id is left untouched, and the failure is
// logged when options.log_errors() is set.
RE2::ErrorCode FilteredRE2::Add(const StringPiece& pattern,
                                const RE2::Options& options, int* id) {
  RE2* re = new RE2(pattern, options);
  // Read the code up front: re may be deleted below.
  RE2::ErrorCode code = re->error_code();

  if (!re->ok()) {
    if (options.log_errors()) {
      // Log the pattern text; streaming `re` itself would only print
      // the address of the RE2 object.
      LOG(ERROR) << "Couldn't compile regular expression, skipping: "
                 << re->pattern() << " due to error " << re->error();
    }
    delete re;
  } else {
    *id = static_cast<int>(re2_vec_.size());
    re2_vec_.push_back(re);
  }

  return code;
}
|
||||
|
||||
// Builds the prefilter tree from every regexp added so far and stores the
// resulting atom strings in *atoms.  If Compile has already run, or no
// regexp has been added, it only logs the current state and returns
// without touching *atoms.
void FilteredRE2::Compile(vector<string>* atoms) {
  const bool nothing_to_do = compiled_ || re2_vec_.empty();
  if (nothing_to_do) {
    LOG(INFO) << "C: " << compiled_ << " S:" << re2_vec_.size();
    return;
  }

  // One prefilter per regexp, added in the order the regexps were added.
  for (vector<RE2*>::const_iterator it = re2_vec_.begin();
       it != re2_vec_.end(); ++it) {
    prefilter_tree_->Add(Prefilter::FromRE2(*it));
  }

  atoms->clear();
  prefilter_tree_->Compile(atoms);
  compiled_ = true;
}
|
||||
|
||||
int FilteredRE2::SlowFirstMatch(const StringPiece& text) const {
|
||||
for (size_t i = 0; i < re2_vec_.size(); i++)
|
||||
if (RE2::PartialMatch(text, *re2_vec_[i]))
|
||||
return static_cast<int>(i);
|
||||
return -1;
|
||||
}
|
||||
|
||||
int FilteredRE2::FirstMatch(const StringPiece& text,
|
||||
const vector<int>& atoms) const {
|
||||
if (!compiled_) {
|
||||
LOG(DFATAL) << "FirstMatch called before Compile";
|
||||
return -1;
|
||||
}
|
||||
vector<int> regexps;
|
||||
prefilter_tree_->RegexpsGivenStrings(atoms, ®exps);
|
||||
for (size_t i = 0; i < regexps.size(); i++)
|
||||
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
|
||||
return regexps[i];
|
||||
return -1;
|
||||
}
|
||||
|
||||
bool FilteredRE2::AllMatches(
|
||||
const StringPiece& text,
|
||||
const vector<int>& atoms,
|
||||
vector<int>* matching_regexps) const {
|
||||
matching_regexps->clear();
|
||||
vector<int> regexps;
|
||||
prefilter_tree_->RegexpsGivenStrings(atoms, ®exps);
|
||||
for (size_t i = 0; i < regexps.size(); i++)
|
||||
if (RE2::PartialMatch(text, *re2_vec_[regexps[i]]))
|
||||
matching_regexps->push_back(regexps[i]);
|
||||
return !matching_regexps->empty();
|
||||
}
|
||||
|
||||
void FilteredRE2::AllPotentials(
|
||||
const vector<int>& atoms,
|
||||
vector<int>* potential_regexps) const {
|
||||
prefilter_tree_->RegexpsGivenStrings(atoms, potential_regexps);
|
||||
}
|
||||
|
||||
// Thin forwarder to PrefilterTree::RegexpsGivenStrings: stores in
// *passed_regexps the regexps the tree reports for matched_atoms.
void FilteredRE2::RegexpsGivenStrings(const vector<int>& matched_atoms,
                                      vector<int>* passed_regexps) {
  prefilter_tree_->RegexpsGivenStrings(matched_atoms, passed_regexps);
}
|
||||
|
||||
// Thin forwarder to PrefilterTree::PrintPrefilter for the regexp with
// index regexpid.
void FilteredRE2::PrintPrefilter(int regexpid) {
  prefilter_tree_->PrintPrefilter(regexpid);
}
|
||||
|
||||
} // namespace re2
|
109
third_party/re2/re2/filtered_re2.h
vendored
109
third_party/re2/re2/filtered_re2.h
vendored
@ -1,109 +0,0 @@
|
||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// The class FilteredRE2 is used as a wrapper to multiple RE2 regexps.
|
||||
// It provides a prefilter mechanism that helps in cutting down the
|
||||
// number of regexps that need to be actually searched.
|
||||
//
|
||||
// By design, it does not include a string matching engine. This is to
|
||||
// allow the user of the class to use their favorite string match
|
||||
// engine. The overall flow is: Add all the regexps using Add, then
|
||||
// Compile the FilteredRE2. The compile returns strings that need to
|
||||
// be matched. Note that all returned strings are lowercase. For
|
||||
// applying regexps to a search text, the caller does the string
|
||||
// matching using the strings returned. When doing the string match,
|
||||
// note that the caller has to do that on lower cased version of the
|
||||
// search text. Then call FirstMatch or AllMatches with a vector of
|
||||
// indices of strings that were found in the text to get the actual
|
||||
// regexp matches.
|
||||
|
||||
#ifndef RE2_FILTERED_RE2_H_
|
||||
#define RE2_FILTERED_RE2_H_
|
||||
|
||||
#include <vector>
|
||||
#include "re2/re2.h"
|
||||
|
||||
namespace re2 {
|
||||
using std::vector;
|
||||
|
||||
class PrefilterTree;
|
||||
|
||||
class FilteredRE2 {
|
||||
public:
|
||||
FilteredRE2();
|
||||
~FilteredRE2();
|
||||
|
||||
// Uses RE2 constructor to create a RE2 object (re). Returns
|
||||
// re->error_code(). If error_code is other than NoError, then re is
|
||||
// deleted and not added to re2_vec_.
|
||||
RE2::ErrorCode Add(const StringPiece& pattern,
|
||||
const RE2::Options& options,
|
||||
int *id);
|
||||
|
||||
// Prepares the regexps added by Add for filtering. Returns a set
|
||||
// of strings that the caller should check for in candidate texts.
|
||||
// The returned strings are lowercased. When doing string matching,
|
||||
// the search text should be lowercased first to find matching
|
||||
// strings from the set of strings returned by Compile. Call after
|
||||
// all Add calls are done.
|
||||
void Compile(vector<string>* strings_to_match);
|
||||
|
||||
// Returns the index of the first matching regexp.
|
||||
// Returns -1 on no match. Can be called prior to Compile.
|
||||
// Does not do any filtering: simply tries to Match the
|
||||
// regexps in a loop.
|
||||
int SlowFirstMatch(const StringPiece& text) const;
|
||||
|
||||
// Returns the index of the first matching regexp.
|
||||
// Returns -1 on no match. Compile has to be called before
|
||||
// calling this.
|
||||
int FirstMatch(const StringPiece& text,
|
||||
const vector<int>& atoms) const;
|
||||
|
||||
// Returns the indices of all matching regexps, after first clearing
|
||||
// matched_regexps.
|
||||
bool AllMatches(const StringPiece& text,
|
||||
const vector<int>& atoms,
|
||||
vector<int>* matching_regexps) const;
|
||||
|
||||
// Returns the indices of all potentially matching regexps after first
|
||||
// clearing potential_regexps.
|
||||
// A regexp is potentially matching if it passes the filter.
|
||||
// If a regexp passes the filter it may still not match.
|
||||
// A regexp that does not pass the filter is guaranteed to not match.
|
||||
void AllPotentials(const vector<int>& atoms,
|
||||
vector<int>* potential_regexps) const;
|
||||
|
||||
// The number of regexps added.
|
||||
int NumRegexps() const { return static_cast<int>(re2_vec_.size()); }
|
||||
|
||||
private:
|
||||
|
||||
// Get the individual RE2 objects. Useful for testing.
|
||||
RE2* GetRE2(int regexpid) const { return re2_vec_[regexpid]; }
|
||||
|
||||
// Print prefilter.
|
||||
void PrintPrefilter(int regexpid);
|
||||
|
||||
// Useful for testing and debugging.
|
||||
void RegexpsGivenStrings(const vector<int>& matched_atoms,
|
||||
vector<int>* passed_regexps);
|
||||
|
||||
// All the regexps in the FilteredRE2.
|
||||
vector<RE2*> re2_vec_;
|
||||
|
||||
// Has the FilteredRE2 been compiled using Compile()
|
||||
bool compiled_;
|
||||
|
||||
// An AND-OR tree of string atoms used for filtering regexps.
|
||||
PrefilterTree* prefilter_tree_;
|
||||
|
||||
//DISALLOW_COPY_AND_ASSIGN(FilteredRE2);
|
||||
FilteredRE2(const FilteredRE2&);
|
||||
void operator=(const FilteredRE2&);
|
||||
};
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_FILTERED_RE2_H_
|
116
third_party/re2/re2/make_perl_groups.pl
vendored
116
third_party/re2/re2/make_perl_groups.pl
vendored
@ -1,116 +0,0 @@
|
||||
#!/usr/bin/perl
|
||||
# Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
# Generate table entries giving character ranges
|
||||
# for POSIX/Perl character classes. Rather than
|
||||
# figure out what the definition is, it is easier to ask
|
||||
# Perl about each letter from 0-128 and write down
|
||||
# its answer.
|
||||
|
||||
# POSIX bracket classes to tabulate, in output order.
@posixclasses = qw(
  [:alnum:]
  [:alpha:]
  [:ascii:]
  [:blank:]
  [:cntrl:]
  [:digit:]
  [:graph:]
  [:lower:]
  [:print:]
  [:punct:]
  [:space:]
  [:upper:]
  [:word:]
  [:xdigit:]
);

# Perl backslash classes to tabulate, in output order.
@perlclasses = ('\d', '\s', '\w');

# Per-character overrides, keyed by "<class>:<code point>"; the value
# replaces whatever the running Perl would answer for that character.
%overrides = (
  # Prior to Perl 5.18, \s did not match vertical tab.
  # RE2 preserves that original behaviour.
  '\s:11' => 0,
);
|
||||
|
||||
# Returns the list of [lo, hi] code point ranges (as array refs) that the
# character class named by the argument matches among code points 0..128.
# Entries in %overrides take precedence over what Perl itself answers.
sub ComputeClass($) {
  my ($class) = @_;
  my $matcher = qr/[$class]/;
  my @found;
  my $run_start = -1;

  # 256 acts as a sentinel after 128: it is never a member, so it
  # closes any run that is still open at the end of the scan.
  foreach my $code (0 .. 128, 256) {
    my $member = $code <= 128
        && ($overrides{"$class:$code"} // (chr($code) =~ $matcher));
    if ($member) {
      $run_start = $code if $run_start < 0;
      next;
    }
    push @found, [$run_start, $code - 1] if $run_start >= 0;
    $run_start = -1;
  }
  return @found;
}
|
||||
|
||||
# Prints the C++ URange16 table "code<cnum>" for the class $cname and
# returns two UGroup initializer strings that refer to it: one for the
# class itself (+1) and one for its negation (-1).
sub PrintClass($$@) {
  my ($cnum, $cname, @ranges) = @_;
  print "static const URange16 code${cnum}[] = { /* $cname */\n";
  foreach my $range (@ranges) {
    my ($lo, $hi) = @$range;
    printf "\t{ 0x%x, 0x%x },\n", $lo, $hi;
  }
  print "};\n";
  my $n = @ranges;

  # Double the backslashes so the name survives inside a C++ string literal.
  my $escname = $cname;
  $escname =~ s/\\/\\\\/g;

  # Negated name: [:foo:] becomes [:^foo:], \d becomes \D.
  # Declared with `my` so it does not leak into a package global.
  my $negname = $escname;
  if ($negname =~ /:/) {
    $negname =~ s/:/:^/;
  } else {
    $negname =~ y/a-z/A-Z/;
  }
  return "{ \"$escname\", +1, code$cnum, $n }", "{ \"$negname\", -1, code$cnum, $n }";
}
|
||||
|
||||
# Running number used to name the generated code<N> tables; shared by
# every PrintClasses call so the names stay unique across groups.
my $cnum = 0;

# Prints the range table for each class in the list, followed by the
# "<pname>_groups" UGroup array that refers to them and its element count.
sub PrintClasses($@) {
  my ($pname, @classes) = @_;
  my @entries;
  for my $class (@classes) {
    my @class_ranges = ComputeClass($class);
    $cnum++;
    push @entries, PrintClass($cnum, $class, @class_ranges);
  }
  print "const UGroup ${pname}_groups[] = {\n";
  print "\t$_,\n" for @entries;
  print "};\n";
  my $count = scalar(@entries);
  print "const int num_${pname}_groups = $count;\n";
}
|
||||
|
||||
# Emit the generated file: banner and namespace opening, the Perl and
# POSIX group tables, then the namespace closing.
print <<"END_HEADER";
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
// make_perl_groups.pl >perl_groups.cc

#include "re2/unicode_groups.h"

namespace re2 {

END_HEADER

PrintClasses("perl", @perlclasses);
PrintClasses("posix", @posixclasses);

print <<"END_TRAILER";

} // namespace re2
END_TRAILER
|
147
third_party/re2/re2/make_unicode_casefold.py
vendored
147
third_party/re2/re2/make_unicode_casefold.py
vendored
@ -1,147 +0,0 @@
|
||||
#!/usr/bin/python
|
||||
# coding=utf-8
|
||||
#
|
||||
# Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
# See unicode_casefold.h for description of case folding tables.
|
||||
|
||||
"""Generate C++ table for Unicode case folding."""
|
||||
|
||||
import sys
|
||||
import unicode
|
||||
|
||||
_header = """
|
||||
// GENERATED BY make_unicode_casefold.py; DO NOT EDIT.
|
||||
// make_unicode_casefold.py >unicode_casefold.cc
|
||||
|
||||
#include "re2/unicode_casefold.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
"""
|
||||
|
||||
_trailer = """
|
||||
|
||||
} // namespace re2
|
||||
|
||||
"""
|
||||
|
||||
def _Delta(a, b):
|
||||
"""Compute the delta for b - a. Even/odd and odd/even
|
||||
are handled specially, as described above."""
|
||||
if a+1 == b:
|
||||
if a%2 == 0:
|
||||
return 'EvenOdd'
|
||||
else:
|
||||
return 'OddEven'
|
||||
if a == b+1:
|
||||
if a%2 == 0:
|
||||
return 'OddEven'
|
||||
else:
|
||||
return 'EvenOdd'
|
||||
return b - a
|
||||
|
||||
def _AddDelta(a, delta):
|
||||
"""Return a + delta, handling EvenOdd and OddEven specially."""
|
||||
if type(delta) == int:
|
||||
return a+delta
|
||||
if delta == 'EvenOdd':
|
||||
if a%2 == 0:
|
||||
return a+1
|
||||
else:
|
||||
return a-1
|
||||
if delta == 'OddEven':
|
||||
if a%2 == 1:
|
||||
return a+1
|
||||
else:
|
||||
return a-1
|
||||
print >>sys.stderr, "Bad Delta: ", delta
|
||||
raise "Bad Delta"
|
||||
|
||||
def _MakeRanges(pairs):
  """Collapse (source, target) pairs into [lo, hi, delta] ranges.

  For example [(65,97), (66,98), ..., (90,122)] becomes [[65, 90, 32]].
  A pair is folded into the most recent range when its source directly
  follows the previous source and the range's delta maps it to its target,
  or when its source is two past the previous one and a symbolic
  (EvenOdd/OddEven) delta fits; in the latter case the range's delta gets
  a 'Skip' suffix.  Otherwise the pair opens a new range.
  """
  ranges = []
  prev = -100

  def extend_adjacent(prev, src, dst, run):
    # Source immediately follows the previous one, same delta applies.
    if src == prev + 1 and dst == _AddDelta(src, run[2]):
      run[1] = src
      return True
    return False

  def extend_alternating(prev, src, dst, run):
    # Source is two past the previous one; only symbolic deltas qualify.
    if src != prev + 2:
      return False
    tag = run[2]
    if type(tag) is not str:
      return False
    if tag.endswith('Skip'):
      base, skip_tag = tag[:-4], tag
    else:
      base, skip_tag = tag, tag + 'Skip'
    if dst != _AddDelta(src, base):
      return False
    run[1] = src
    run[2] = skip_tag
    return True

  for src, dst in pairs:
    grown = False
    if ranges:
      current = ranges[-1]
      grown = (extend_adjacent(prev, src, dst, current) or
               extend_alternating(prev, src, dst, current))
    if not grown:
      ranges.append([src, src, _Delta(src, dst)])
    prev = src
  return ranges
|
||||
|
||||
# The maximum size of a case-folding group.
|
||||
# Case folding is implemented in parse.cc by a recursive process
|
||||
# with a recursion depth equal to the size of the largest
|
||||
# case-folding group, so it is important that this bound be small.
|
||||
# The current tables have no group bigger than 4.
|
||||
# If there are ever groups bigger than 10 or so, it will be
|
||||
# time to rework the code in parse.cc.
|
||||
MaxCasefoldGroup = 4
|
||||
|
||||
def main():
  """Print the C++ case-folding and to-lower tables on stdout."""
  lowergroups, casegroups = unicode.CaseGroups()

  # Every member of a case group folds to the next member, cyclically:
  # index i-1 wraps around to the last element when i is 0.
  foldpairs = []
  seen = {}
  for group in casegroups:
    if len(group) > MaxCasefoldGroup:
      raise unicode.Error("casefold group too long: %s" % (group,))
    for i in range(len(group)):
      src, dst = group[i-1], group[i]
      if src in seen:
        raise unicode.Error("bad casegroups %d -> %d" % (src, dst))
      seen[src] = True
      foldpairs.append([src, dst])

  # Every non-lowercase member of a group maps to the group's lowercase.
  lowerpairs = []
  for lower, group in lowergroups.iteritems():
    lowerpairs.extend([g, lower] for g in group if g != lower)

  def printpairs(name, foldpairs):
    # Sorts the caller's list in place before turning it into ranges.
    foldpairs.sort()
    foldranges = _MakeRanges(foldpairs)
    out = [
        "// %d groups, %d pairs, %d ranges" % (len(casegroups), len(foldpairs), len(foldranges)),
        "const CaseFold unicode_%s[] = {" % (name,),
    ]
    for lo, hi, delta in foldranges:
      out.append("\t{ %d, %d, %s }," % (lo, hi, delta))
    out.append("};")
    out.append("const int num_unicode_%s = %d;" % (name, len(foldranges),))
    out.append("")
    print("\n".join(out))

  print(_header)
  printpairs("casefold", foldpairs)
  printpairs("tolower", lowerpairs)
  print(_trailer)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
111
third_party/re2/re2/make_unicode_groups.py
vendored
111
third_party/re2/re2/make_unicode_groups.py
vendored
@ -1,111 +0,0 @@
|
||||
#!/usr/bin/python
|
||||
# Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
# Use of this source code is governed by a BSD-style
|
||||
# license that can be found in the LICENSE file.
|
||||
|
||||
"""Generate C++ tables for Unicode Script and Category groups."""
|
||||
|
||||
import sys
|
||||
import unicode
|
||||
|
||||
_header = """
|
||||
// GENERATED BY make_unicode_groups.py; DO NOT EDIT.
|
||||
// make_unicode_groups.py >unicode_groups.cc
|
||||
|
||||
#include "re2/unicode_groups.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
"""
|
||||
|
||||
_trailer = """
|
||||
|
||||
} // namespace re2
|
||||
|
||||
"""
|
||||
|
||||
n16 = 0
|
||||
n32 = 0
|
||||
|
||||
def MakeRanges(codes):
  """Turn a list like [1,2,3,7,8,9] into a range list [[1,3], [7,9]].

  A code extends the current range only when it is exactly one more than
  the code seen just before it; anything else starts a new [c, c] range.
  """
  spans = []
  previous = -100
  for code in codes:
    if code != previous + 1:
      spans.append([code, code])
    else:
      spans[-1][1] = code
    previous = code
  return spans
|
||||
|
||||
def PrintRanges(type, name, ranges):
  """Print ranges as a static const C++ array of the given type and name."""
  rows = ["\t{ %d, %d }," % (lo, hi) for lo, hi in ranges]
  lines = ["static const %s %s[] = {" % (type, name,)] + rows + ["};"]
  print("\n".join(lines))
|
||||
|
||||
# def PrintCodes(type, name, codes):
|
||||
# """Print the codes as an array of type named name."""
|
||||
# print "static %s %s[] = {" % (type, name,)
|
||||
# for c in codes:
|
||||
# print "\t%d," % (c,)
|
||||
# print "};"
|
||||
|
||||
def PrintGroup(name, codes):
  """Print the data structures for the group of codes.

  Return a UGroup literal for the group."""
  global n16
  global n32

  # See unicode_groups.h for a description of the data structure.

  # Split codes into 16-bit ranges and 32-bit ranges.
  range16 = MakeRanges([c for c in codes if c < 65536])
  range32 = MakeRanges([c for c in codes if c >= 65536])

  # Keep the running totals that main() reports.
  n16 += len(range16)
  n32 += len(range32)

  # Each non-empty table is printed and referenced by name and length;
  # an empty one is represented by ", 0, 0".
  fields = ['{ "%s", +1' % (name,)]
  tables = (("URange16", "_range16", range16),
            ("URange32", "_range32", range32))
  for ctype, suffix, table in tables:
    if len(table) > 0:
      PrintRanges(ctype, name+suffix, table)
      fields.append(", %s%s, %d" % (name, suffix, len(table)))
    else:
      fields.append(", 0, 0")
  fields.append(" }")
  return "".join(fields)
|
||||
|
||||
def main():
  """Print the C++ tables for all Unicode categories and scripts."""
  print(_header)

  # PrintGroup prints each group's range tables as a side effect and
  # returns the UGroup literal that refers to them.  Categories are
  # processed before scripts.
  ugroups = []
  for loader in (unicode.Categories, unicode.Scripts):
    for name, codes in loader().iteritems():
      ugroups.append(PrintGroup(name, codes))

  print("// %d 16-bit ranges, %d 32-bit ranges" % (n16, n32))
  print("const UGroup unicode_groups[] = {")
  ugroups.sort()
  for literal in ugroups:
    print("\t%s," % (literal,))
  print("};")
  print("const int num_unicode_groups = %d;" % (len(ugroups),))
  print(_trailer)
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
185
third_party/re2/re2/mimics_pcre.cc
vendored
185
third_party/re2/re2/mimics_pcre.cc
vendored
@ -1,185 +0,0 @@
|
||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Determine whether this library should match PCRE exactly
|
||||
// for a particular Regexp. (If so, the testing framework can
|
||||
// check that it does.)
|
||||
//
|
||||
// This library matches PCRE except in these cases:
|
||||
// * the regexp contains a repetition of an empty string,
|
||||
// like (a*)* or (a*)+. In this case, PCRE will treat
|
||||
// the repetition sequence as ending with an empty string,
|
||||
// while this library does not.
|
||||
// * Perl and PCRE differ on whether \v matches \n.
|
||||
// For historical reasons, this library implements the Perl behavior.
|
||||
// * Perl and PCRE allow $ in one-line mode to match either the very
|
||||
// end of the text or just before a \n at the end of the text.
|
||||
// This library requires it to match only the end of the text.
|
||||
// * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
|
||||
// match the end of the text if the last character is a \n.
|
||||
// This library does allow it.
|
||||
//
|
||||
// Regexp::MimicsPCRE checks for any of these conditions.
|
||||
|
||||
#include "util/util.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/walker-inl.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Returns whether re might match an empty string.
|
||||
static bool CanBeEmptyString(Regexp *re);
|
||||
|
||||
// Walker class to compute whether library handles a regexp
|
||||
// exactly as PCRE would. See comment at top for conditions.
|
||||
|
||||
class PCREWalker : public Regexp::Walker<bool> {
|
||||
public:
|
||||
PCREWalker() {}
|
||||
bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
|
||||
int nchild_args);
|
||||
|
||||
bool ShortVisit(Regexp* re, bool a) {
|
||||
// Should never be called: we use Walk not WalkExponential.
|
||||
LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
|
||||
return a;
|
||||
}
|
||||
};
|
||||
|
||||
// Called after visiting each of re's children and accumulating
|
||||
// the return values in child_args. So child_args contains whether
|
||||
// this library mimics PCRE for those subexpressions.
|
||||
bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
||||
bool* child_args, int nchild_args) {
|
||||
// If children failed, so do we.
|
||||
for (int i = 0; i < nchild_args; i++)
|
||||
if (!child_args[i])
|
||||
return false;
|
||||
|
||||
// Otherwise look for other reasons to fail.
|
||||
switch (re->op()) {
|
||||
// Look for repeated empty string.
|
||||
case kRegexpStar:
|
||||
case kRegexpPlus:
|
||||
case kRegexpQuest:
|
||||
if (CanBeEmptyString(re->sub()[0]))
|
||||
return false;
|
||||
break;
|
||||
case kRegexpRepeat:
|
||||
if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
|
||||
return false;
|
||||
break;
|
||||
|
||||
// Look for \v
|
||||
case kRegexpLiteral:
|
||||
if (re->rune() == '\v')
|
||||
return false;
|
||||
break;
|
||||
|
||||
// Look for $ in single-line mode.
|
||||
case kRegexpEndText:
|
||||
case kRegexpEmptyMatch:
|
||||
if (re->parse_flags() & Regexp::WasDollar)
|
||||
return false;
|
||||
break;
|
||||
|
||||
// Look for ^ in multi-line mode.
|
||||
case kRegexpBeginLine:
|
||||
// No condition: in single-line mode ^ becomes kRegexpBeginText.
|
||||
return false;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
||||
// Not proven guilty.
|
||||
return true;
|
||||
}
|
||||
|
||||
// Returns whether this regexp's behavior will mimic PCRE's exactly.
|
||||
bool Regexp::MimicsPCRE() {
|
||||
PCREWalker w;
|
||||
return w.Walk(this, true);
|
||||
}
|
||||
|
||||
|
||||
// Walker class to compute whether a Regexp can match an empty string.
|
||||
// It is okay to overestimate. For example, \b\B cannot match an empty
|
||||
// string, because \b and \B are mutually exclusive, but this isn't
|
||||
// that smart and will say it can. Spurious empty strings
|
||||
// will reduce the number of regexps we sanity check against PCRE,
|
||||
// but they won't break anything.
|
||||
|
||||
class EmptyStringWalker : public Regexp::Walker<bool> {
|
||||
public:
|
||||
EmptyStringWalker() { }
|
||||
bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
||||
bool* child_args, int nchild_args);
|
||||
|
||||
bool ShortVisit(Regexp* re, bool a) {
|
||||
// Should never be called: we use Walk not WalkExponential.
|
||||
LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
|
||||
return a;
|
||||
}
|
||||
|
||||
private:
|
||||
DISALLOW_COPY_AND_ASSIGN(EmptyStringWalker);
|
||||
};
|
||||
|
||||
// Called after visiting re's children. child_args contains the return
|
||||
// value from each of the children's PostVisits (i.e., whether each child
|
||||
// can match an empty string). Returns whether this clause can match an
|
||||
// empty string.
|
||||
bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
||||
bool* child_args, int nchild_args) {
|
||||
switch (re->op()) {
|
||||
case kRegexpNoMatch: // never empty
|
||||
case kRegexpLiteral:
|
||||
case kRegexpAnyChar:
|
||||
case kRegexpAnyByte:
|
||||
case kRegexpCharClass:
|
||||
case kRegexpLiteralString:
|
||||
return false;
|
||||
|
||||
case kRegexpEmptyMatch: // always empty
|
||||
case kRegexpBeginLine: // always empty, when they match
|
||||
case kRegexpEndLine:
|
||||
case kRegexpNoWordBoundary:
|
||||
case kRegexpWordBoundary:
|
||||
case kRegexpBeginText:
|
||||
case kRegexpEndText:
|
||||
case kRegexpStar: // can always be empty
|
||||
case kRegexpQuest:
|
||||
case kRegexpHaveMatch:
|
||||
return true;
|
||||
|
||||
case kRegexpConcat: // can be empty if all children can
|
||||
for (int i = 0; i < nchild_args; i++)
|
||||
if (!child_args[i])
|
||||
return false;
|
||||
return true;
|
||||
|
||||
case kRegexpAlternate: // can be empty if any child can
|
||||
for (int i = 0; i < nchild_args; i++)
|
||||
if (child_args[i])
|
||||
return true;
|
||||
return false;
|
||||
|
||||
case kRegexpPlus: // can be empty if the child can
|
||||
case kRegexpCapture:
|
||||
return child_args[0];
|
||||
|
||||
case kRegexpRepeat: // can be empty if child can or is x{0}
|
||||
return child_args[0] || re->min() == 0;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Returns whether re can match an empty string.
|
||||
static bool CanBeEmptyString(Regexp* re) {
|
||||
EmptyStringWalker w;
|
||||
return w.Walk(re, true);
|
||||
}
|
||||
|
||||
} // namespace re2
|
758
third_party/re2/re2/nfa.cc
vendored
758
third_party/re2/re2/nfa.cc
vendored
@ -1,758 +0,0 @@
|
||||
// Copyright 2006-2007 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Tested by search_test.cc.
|
||||
//
|
||||
// Prog::SearchNFA, an NFA search.
|
||||
// This is an actual NFA like the theorists talk about,
|
||||
// not the pseudo-NFA found in backtracking regexp implementations.
|
||||
//
|
||||
// IMPLEMENTATION
|
||||
//
|
||||
// This algorithm is a variant of one that appeared in Rob Pike's sam editor,
|
||||
// which is a variant of the one described in Thompson's 1968 CACM paper.
|
||||
// See http://swtch.com/~rsc/regexp/ for various history. The main feature
|
||||
// over the DFA implementation is that it tracks submatch boundaries.
|
||||
//
|
||||
// When the choice of submatch boundaries is ambiguous, this particular
|
||||
// implementation makes the same choices that traditional backtracking
|
||||
// implementations (in particular, Perl and PCRE) do.
|
||||
// Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential
|
||||
// time in the length of the input.
|
||||
//
|
||||
// Like Thompson's original machine and like the DFA implementation, this
|
||||
// implementation notices a match only once it is one byte past it.
|
||||
|
||||
#include "re2/prog.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "util/sparse_array.h"
|
||||
#include "util/sparse_set.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
class NFA {
|
||||
public:
|
||||
NFA(Prog* prog);
|
||||
~NFA();
|
||||
|
||||
// Searches for a matching string.
|
||||
// * If anchored is true, only considers matches starting at offset.
|
||||
// Otherwise finds leftmost match at or after offset.
|
||||
// * If longest is true, returns the longest match starting
|
||||
// at the chosen start point. Otherwise returns the so-called
|
||||
// left-biased match, the one traditional backtracking engines
|
||||
// (like Perl and PCRE) find.
|
||||
// Records submatch boundaries in submatch[1..nsubmatch-1].
|
||||
// Submatch[0] is the entire match. When there is a choice in
|
||||
// which text matches each subexpression, the submatch boundaries
|
||||
// are chosen to match what a backtracking implementation would choose.
|
||||
bool Search(const StringPiece& text, const StringPiece& context,
|
||||
bool anchored, bool longest,
|
||||
StringPiece* submatch, int nsubmatch);
|
||||
|
||||
static const int Debug = 0;
|
||||
|
||||
private:
|
||||
struct Thread {
|
||||
union {
|
||||
int id;
|
||||
Thread* next; // when on free list
|
||||
};
|
||||
const char** capture;
|
||||
};
|
||||
|
||||
// State for explicit stack in AddToThreadq.
|
||||
struct AddState {
|
||||
int id; // Inst to process
|
||||
int j;
|
||||
const char* cap_j; // if j>=0, set capture[j] = cap_j before processing ip
|
||||
|
||||
AddState()
|
||||
: id(0), j(-1), cap_j(NULL) {}
|
||||
explicit AddState(int id)
|
||||
: id(id), j(-1), cap_j(NULL) {}
|
||||
AddState(int id, const char* cap_j, int j)
|
||||
: id(id), j(j), cap_j(cap_j) {}
|
||||
};
|
||||
|
||||
// Threadq is a list of threads. The list is sorted by the order
|
||||
// in which Perl would explore that particular state -- the earlier
|
||||
// choices appear earlier in the list.
|
||||
typedef SparseArray<Thread*> Threadq;
|
||||
|
||||
inline Thread* AllocThread();
|
||||
inline void FreeThread(Thread*);
|
||||
|
||||
// Add id (or its children, following unlabeled arrows)
|
||||
// to the workqueue q with associated capture info.
|
||||
void AddToThreadq(Threadq* q, int id, int flag,
|
||||
const char* p, const char** capture);
|
||||
|
||||
// Run runq on byte c, appending new states to nextq.
|
||||
// Updates matched_ and match_ as new, better matches are found.
|
||||
// p is position of the next byte (the one after c)
|
||||
// in the input string, used when processing capturing parens.
|
||||
// flag is the bitwise or of Bol, Eol, etc., specifying whether
|
||||
// ^, $ and \b match the current input point (after c).
|
||||
inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p);
|
||||
|
||||
// Returns text version of capture information, for debugging.
|
||||
string FormatCapture(const char** capture);
|
||||
|
||||
inline void CopyCapture(const char** dst, const char** src);
|
||||
|
||||
// Computes whether all matches must begin with the same first
|
||||
// byte, and if so, returns that byte. If not, returns -1.
|
||||
int ComputeFirstByte();
|
||||
|
||||
Prog* prog_; // underlying program
|
||||
int start_; // start instruction in program
|
||||
int ncapture_; // number of submatches to track
|
||||
bool longest_; // whether searching for longest match
|
||||
bool endmatch_; // whether match must end at text.end()
|
||||
const char* btext_; // beginning of text being matched (for FormatSubmatch)
|
||||
const char* etext_; // end of text being matched (for endmatch_)
|
||||
Threadq q0_, q1_; // pre-allocated for Search.
|
||||
const char** match_; // best match so far
|
||||
bool matched_; // any match so far?
|
||||
AddState* astack_; // pre-allocated for AddToThreadq
|
||||
int nastack_;
|
||||
int first_byte_; // required first byte for match, or -1 if none
|
||||
|
||||
Thread* free_threads_; // free list
|
||||
|
||||
DISALLOW_COPY_AND_ASSIGN(NFA);
|
||||
};
|
||||
|
||||
// Binds the NFA to prog and pre-allocates everything whose size depends
// only on the program: both thread queues and the AddToThreadq stack.
// Per-search state (capture count, match storage) is set up by Search.
NFA::NFA(Prog* prog)
    : prog_(prog),
      start_(prog->start()),
      ncapture_(0),
      longest_(false),
      endmatch_(false),
      btext_(NULL),
      etext_(NULL),
      match_(NULL),
      matched_(false),
      astack_(NULL),
      nastack_(0),
      first_byte_(-1),
      free_threads_(NULL) {
  q0_.resize(prog_->size());
  q1_.resize(prog_->size());

  // Two stack entries per instruction; see AddToThreadq for why that
  // bound is sufficient.
  nastack_ = 2*prog_->size();
  astack_ = new AddState[nastack_];

  // Needs prog_ and start_, both already initialized above.
  first_byte_ = ComputeFirstByte();
}
|
||||
|
||||
// Releases the match buffer, the AddToThreadq stack, and every thread
// parked on the free list along with its capture array.
NFA::~NFA() {
  delete[] match_;
  delete[] astack_;
  while (free_threads_ != NULL) {
    Thread* dead = free_threads_;
    free_threads_ = dead->next;
    delete[] dead->capture;
    delete dead;
  }
}
|
||||
|
||||
// Pushes t onto the free list for reuse by AllocThread.
// A NULL thread is ignored.
void NFA::FreeThread(Thread* t) {
  if (t != NULL) {
    t->next = free_threads_;
    free_threads_ = t;
  }
}
|
||||
|
||||
// Returns a thread to use: the head of the free list if there is one,
// otherwise a newly allocated thread with room for ncapture_ capture
// registers.  The contents of the capture array are unspecified; callers
// fill it with CopyCapture.
NFA::Thread* NFA::AllocThread() {
  if (free_threads_ != NULL) {
    Thread* recycled = free_threads_;
    free_threads_ = recycled->next;
    return recycled;
  }

  Thread* fresh = new Thread;
  fresh->capture = new const char*[ncapture_];
  return fresh;
}
|
||||
|
||||
// Copies the capture registers from src to dst, one (begin, end) pair
// at a time, until ncapture_ registers have been covered.
void NFA::CopyCapture(const char** dst, const char** src) {
  int remaining = ncapture_;
  while (remaining > 0) {
    *dst++ = *src++;  // begin of pair
    *dst++ = *src++;  // end of pair
    remaining -= 2;
  }
}
|
||||
|
||||
// Follows all empty arrows from id0 and enqueues all the states reached.
|
||||
// The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match.
|
||||
// The pointer p is the current input position, and m is the
|
||||
// current set of match boundaries.
|
||||
void NFA::AddToThreadq(Threadq* q, int id0, int flag,
|
||||
const char* p, const char** capture) {
|
||||
if (id0 == 0)
|
||||
return;
|
||||
|
||||
// Astack_ is pre-allocated to avoid resize operations.
|
||||
// It has room for 2*prog_->size() entries, which is enough:
|
||||
// Each inst in prog can be processed at most once,
|
||||
// pushing at most two entries on stk.
|
||||
|
||||
int nstk = 0;
|
||||
AddState* stk = astack_;
|
||||
stk[nstk++] = AddState(id0);
|
||||
|
||||
while (nstk > 0) {
|
||||
DCHECK_LE(nstk, nastack_);
|
||||
const AddState& a = stk[--nstk];
|
||||
if (a.j >= 0)
|
||||
capture[a.j] = a.cap_j;
|
||||
|
||||
int id = a.id;
|
||||
if (id == 0)
|
||||
continue;
|
||||
if (q->has_index(id)) {
|
||||
if (Debug)
|
||||
fprintf(stderr, " [%d%s]\n", id, FormatCapture(capture).c_str());
|
||||
continue;
|
||||
}
|
||||
|
||||
// Create entry in q no matter what. We might fill it in below,
|
||||
// or we might not. Even if not, it is necessary to have it,
|
||||
// so that we don't revisit id0 during the recursion.
|
||||
q->set_new(id, NULL);
|
||||
|
||||
Thread** tp = &q->find(id)->second;
|
||||
int j;
|
||||
Thread* t;
|
||||
Prog::Inst* ip = prog_->inst(id);
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq";
|
||||
break;
|
||||
|
||||
case kInstFail:
|
||||
break;
|
||||
|
||||
case kInstAltMatch:
|
||||
// Save state; will pick up at next byte.
|
||||
t = AllocThread();
|
||||
t->id = id;
|
||||
CopyCapture(t->capture, capture);
|
||||
*tp = t;
|
||||
// fall through
|
||||
|
||||
case kInstAlt:
|
||||
// Explore alternatives.
|
||||
stk[nstk++] = AddState(ip->out1());
|
||||
stk[nstk++] = AddState(ip->out());
|
||||
break;
|
||||
|
||||
case kInstNop:
|
||||
// Continue on.
|
||||
stk[nstk++] = AddState(ip->out());
|
||||
break;
|
||||
|
||||
case kInstCapture:
|
||||
if ((j=ip->cap()) < ncapture_) {
|
||||
// Push a dummy whose only job is to restore capture[j]
|
||||
// once we finish exploring this possibility.
|
||||
stk[nstk++] = AddState(0, capture[j], j);
|
||||
|
||||
// Record capture.
|
||||
capture[j] = p;
|
||||
}
|
||||
stk[nstk++] = AddState(ip->out());
|
||||
break;
|
||||
|
||||
case kInstMatch:
|
||||
case kInstByteRange:
|
||||
// Save state; will pick up at next byte.
|
||||
t = AllocThread();
|
||||
t->id = id;
|
||||
CopyCapture(t->capture, capture);
|
||||
*tp = t;
|
||||
if (Debug)
|
||||
fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(), t);
|
||||
break;
|
||||
|
||||
case kInstEmptyWidth:
|
||||
// Continue on if we have all the right flag bits.
|
||||
if (ip->empty() & ~flag)
|
||||
break;
|
||||
stk[nstk++] = AddState(ip->out());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Run runq on byte c, appending new states to nextq.
|
||||
// Updates match as new, better matches are found.
|
||||
// p is position of the byte c in the input string,
|
||||
// used when processing capturing parens.
|
||||
// flag is the bitwise or of Bol, Eol, etc., specifying whether
|
||||
// ^, $ and \b match the current input point (after c).
|
||||
// Frees all the threads on runq.
|
||||
// If there is a shortcut to the end, returns that shortcut.
|
||||
int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) {
|
||||
nextq->clear();
|
||||
|
||||
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
|
||||
Thread* t = i->second;
|
||||
if (t == NULL)
|
||||
continue;
|
||||
|
||||
if (longest_) {
|
||||
// Can skip any threads started after our current best match.
|
||||
if (matched_ && match_[0] < t->capture[0]) {
|
||||
FreeThread(t);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
int id = t->id;
|
||||
Prog::Inst* ip = prog_->inst(id);
|
||||
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
// Should only see the values handled below.
|
||||
LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step";
|
||||
break;
|
||||
|
||||
case kInstByteRange:
|
||||
if (ip->Matches(c))
|
||||
AddToThreadq(nextq, ip->out(), flag, p+1, t->capture);
|
||||
break;
|
||||
|
||||
case kInstAltMatch:
|
||||
if (i != runq->begin())
|
||||
break;
|
||||
// The match is ours if we want it.
|
||||
if (ip->greedy(prog_) || longest_) {
|
||||
CopyCapture((const char**)match_, t->capture);
|
||||
FreeThread(t);
|
||||
for (++i; i != runq->end(); ++i)
|
||||
FreeThread(i->second);
|
||||
runq->clear();
|
||||
matched_ = true;
|
||||
if (ip->greedy(prog_))
|
||||
return ip->out1();
|
||||
return ip->out();
|
||||
}
|
||||
break;
|
||||
|
||||
case kInstMatch:
|
||||
if (endmatch_ && p != etext_)
|
||||
break;
|
||||
|
||||
const char* old = t->capture[1]; // previous end pointer
|
||||
t->capture[1] = p;
|
||||
if (longest_) {
|
||||
// Leftmost-longest mode: save this match only if
|
||||
// it is either farther to the left or at the same
|
||||
// point but longer than an existing match.
|
||||
if (!matched_ || t->capture[0] < match_[0] ||
|
||||
(t->capture[0] == match_[0] && t->capture[1] > match_[1]))
|
||||
CopyCapture((const char**)match_, t->capture);
|
||||
} else {
|
||||
// Leftmost-biased mode: this match is by definition
|
||||
// better than what we've already found (see next line).
|
||||
CopyCapture((const char**)match_, t->capture);
|
||||
|
||||
// Cut off the threads that can only find matches
|
||||
// worse than the one we just found: don't run the
|
||||
// rest of the current Threadq.
|
||||
t->capture[0] = old;
|
||||
FreeThread(t);
|
||||
for (++i; i != runq->end(); ++i)
|
||||
FreeThread(i->second);
|
||||
runq->clear();
|
||||
matched_ = true;
|
||||
return 0;
|
||||
}
|
||||
t->capture[0] = old;
|
||||
matched_ = true;
|
||||
break;
|
||||
}
|
||||
FreeThread(t);
|
||||
}
|
||||
runq->clear();
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Renders the ncapture_ capture registers as a sequence of "(begin,end)"
// pairs for debug output.  Offsets are relative to btext_; a register
// that is still NULL prints as '?'.  If the begin register is NULL the
// end register is not examined.
string NFA::FormatCapture(const char** capture) {
  string out;

  for (int i = 0; i < ncapture_; i += 2) {
    const char* lo = capture[i];
    if (lo == NULL) {
      StringAppendF(&out, "(?,?)");
      continue;
    }

    const char* hi = capture[i+1];
    if (hi == NULL) {
      StringAppendF(&out, "(%d,?)", (int)(lo - btext_));
    } else {
      StringAppendF(&out, "(%d,%d)",
                    (int)(lo - btext_),
                    (int)(hi - btext_));
    }
  }
  return out;
}
|
||||
|
||||
// Returns whether haystack contains needle's memory.
|
||||
static bool StringPieceContains(const StringPiece haystack, const StringPiece needle) {
|
||||
return haystack.begin() <= needle.begin() &&
|
||||
haystack.end() >= needle.end();
|
||||
}
|
||||
|
||||
bool NFA::Search(const StringPiece& text, const StringPiece& const_context,
|
||||
bool anchored, bool longest,
|
||||
StringPiece* submatch, int nsubmatch) {
|
||||
if (start_ == 0)
|
||||
return false;
|
||||
|
||||
StringPiece context = const_context;
|
||||
if (context.begin() == NULL)
|
||||
context = text;
|
||||
|
||||
if (!StringPieceContains(context, text)) {
|
||||
LOG(FATAL) << "Bad args: context does not contain text "
|
||||
<< reinterpret_cast<const void*>(context.begin())
|
||||
<< "+" << context.size() << " "
|
||||
<< reinterpret_cast<const void*>(text.begin())
|
||||
<< "+" << text.size();
|
||||
return false;
|
||||
}
|
||||
|
||||
if (prog_->anchor_start() && context.begin() != text.begin())
|
||||
return false;
|
||||
if (prog_->anchor_end() && context.end() != text.end())
|
||||
return false;
|
||||
anchored |= prog_->anchor_start();
|
||||
if (prog_->anchor_end()) {
|
||||
longest = true;
|
||||
endmatch_ = true;
|
||||
etext_ = text.end();
|
||||
}
|
||||
|
||||
if (nsubmatch < 0) {
|
||||
LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Save search parameters.
|
||||
ncapture_ = 2*nsubmatch;
|
||||
longest_ = longest;
|
||||
|
||||
if (nsubmatch == 0) {
|
||||
// We need to maintain match[0], both to distinguish the
|
||||
// longest match (if longest is true) and also to tell
|
||||
// whether we've seen any matches at all.
|
||||
ncapture_ = 2;
|
||||
}
|
||||
|
||||
match_ = new const char*[ncapture_];
|
||||
matched_ = false;
|
||||
memset(match_, 0, ncapture_*sizeof match_[0]);
|
||||
|
||||
// For debugging prints.
|
||||
btext_ = context.begin();
|
||||
|
||||
if (Debug) {
|
||||
fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n",
|
||||
text.as_string().c_str(), context.as_string().c_str(), anchored,
|
||||
longest);
|
||||
}
|
||||
|
||||
// Set up search.
|
||||
Threadq* runq = &q0_;
|
||||
Threadq* nextq = &q1_;
|
||||
runq->clear();
|
||||
nextq->clear();
|
||||
memset(&match_[0], 0, ncapture_*sizeof match_[0]);
|
||||
const char* bp = context.begin();
|
||||
int c = -1;
|
||||
int wasword = 0;
|
||||
|
||||
if (text.begin() > context.begin()) {
|
||||
c = text.begin()[-1] & 0xFF;
|
||||
wasword = Prog::IsWordChar(static_cast<uint8>(c));
|
||||
}
|
||||
|
||||
// Loop over the text, stepping the machine.
|
||||
for (const char* p = text.begin();; p++) {
|
||||
// Check for empty-width specials.
|
||||
int flag = 0;
|
||||
|
||||
// ^ and \A
|
||||
if (p == context.begin())
|
||||
flag |= kEmptyBeginText | kEmptyBeginLine;
|
||||
else if (p <= context.end() && p[-1] == '\n')
|
||||
flag |= kEmptyBeginLine;
|
||||
|
||||
// $ and \z
|
||||
if (p == context.end())
|
||||
flag |= kEmptyEndText | kEmptyEndLine;
|
||||
else if (p < context.end() && p[0] == '\n')
|
||||
flag |= kEmptyEndLine;
|
||||
|
||||
// \b and \B
|
||||
int isword = 0;
|
||||
if (p < context.end())
|
||||
isword = Prog::IsWordChar(p[0] & 0xFF);
|
||||
|
||||
if (isword != wasword)
|
||||
flag |= kEmptyWordBoundary;
|
||||
else
|
||||
flag |= kEmptyNonWordBoundary;
|
||||
|
||||
if (Debug) {
|
||||
fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c, flag, isword, wasword);
|
||||
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) {
|
||||
Thread* t = i->second;
|
||||
if (t == NULL)
|
||||
continue;
|
||||
fprintf(stderr, " %d%s", t->id,
|
||||
FormatCapture((const char**)t->capture).c_str());
|
||||
}
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
// Process previous character (waited until now to avoid
|
||||
// repeating the flag computation above).
|
||||
// This is a no-op the first time around the loop, because
|
||||
// runq is empty.
|
||||
int id = Step(runq, nextq, c, flag, p-1);
|
||||
DCHECK_EQ(runq->size(), 0);
|
||||
swap(nextq, runq);
|
||||
nextq->clear();
|
||||
if (id != 0) {
|
||||
// We're done: full match ahead.
|
||||
p = text.end();
|
||||
for (;;) {
|
||||
Prog::Inst* ip = prog_->inst(id);
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode();
|
||||
break;
|
||||
|
||||
case kInstCapture:
|
||||
if (ip->cap() < ncapture_)
|
||||
match_[ip->cap()] = p;
|
||||
id = ip->out();
|
||||
continue;
|
||||
|
||||
case kInstNop:
|
||||
id = ip->out();
|
||||
continue;
|
||||
|
||||
case kInstMatch:
|
||||
match_[1] = p;
|
||||
matched_ = true;
|
||||
break;
|
||||
|
||||
case kInstEmptyWidth:
|
||||
if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) {
|
||||
LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->empty();
|
||||
break;
|
||||
}
|
||||
id = ip->out();
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
if (p > text.end())
|
||||
break;
|
||||
|
||||
// Start a new thread if there have not been any matches.
|
||||
// (No point in starting a new thread if there have been
|
||||
// matches, since it would be to the right of the match
|
||||
// we already found.)
|
||||
if (!matched_ && (!anchored || p == text.begin())) {
|
||||
// If there's a required first byte for an unanchored search
|
||||
// and we're not in the middle of any possible matches,
|
||||
// use memchr to search for the byte quickly.
|
||||
if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
|
||||
p < text.end() && (p[0] & 0xFF) != first_byte_) {
|
||||
p = reinterpret_cast<const char*>(memchr(p, first_byte_,
|
||||
text.end() - p));
|
||||
if (p == NULL) {
|
||||
p = text.end();
|
||||
isword = 0;
|
||||
} else {
|
||||
isword = Prog::IsWordChar(p[0] & 0xFF);
|
||||
}
|
||||
flag = Prog::EmptyFlags(context, p);
|
||||
}
|
||||
|
||||
// Steal match storage (cleared but unused as of yet)
|
||||
// temporarily to hold match boundaries for new thread.
|
||||
match_[0] = p;
|
||||
AddToThreadq(runq, start_, flag, p, match_);
|
||||
match_[0] = NULL;
|
||||
}
|
||||
|
||||
// If all the threads have died, stop early.
|
||||
if (runq->size() == 0) {
|
||||
if (Debug)
|
||||
fprintf(stderr, "dead\n");
|
||||
break;
|
||||
}
|
||||
|
||||
if (p == text.end())
|
||||
c = 0;
|
||||
else
|
||||
c = *p & 0xFF;
|
||||
wasword = isword;
|
||||
|
||||
// Will run step(runq, nextq, c, ...) on next iteration. See above.
|
||||
}
|
||||
|
||||
for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i)
|
||||
FreeThread(i->second);
|
||||
|
||||
if (matched_) {
|
||||
for (int i = 0; i < nsubmatch; i++)
|
||||
submatch[i].set(match_[2*i],
|
||||
static_cast<int>(match_[2*i+1] - match_[2*i]));
|
||||
if (Debug)
|
||||
fprintf(stderr, "match (%d,%d)\n",
|
||||
static_cast<int>(match_[0] - btext_),
|
||||
static_cast<int>(match_[1] - btext_));
|
||||
return true;
|
||||
}
|
||||
VLOG(1) << "No matches found";
|
||||
return false;
|
||||
}
|
||||
|
||||
// Computes whether all successful matches have a common first byte,
|
||||
// and if so, returns that byte. If not, returns -1.
|
||||
int NFA::ComputeFirstByte() {
|
||||
if (start_ == 0)
|
||||
return -1;
|
||||
|
||||
int b = -1; // first byte, not yet computed
|
||||
|
||||
typedef SparseSet Workq;
|
||||
Workq q(prog_->size());
|
||||
q.insert(start_);
|
||||
for (Workq::iterator it = q.begin(); it != q.end(); ++it) {
|
||||
int id = *it;
|
||||
Prog::Inst* ip = prog_->inst(id);
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte";
|
||||
break;
|
||||
|
||||
case kInstMatch:
|
||||
// The empty string matches: no first byte.
|
||||
return -1;
|
||||
|
||||
case kInstByteRange:
|
||||
// Must match only a single byte
|
||||
if (ip->lo() != ip->hi())
|
||||
return -1;
|
||||
if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z')
|
||||
return -1;
|
||||
// If we haven't seen any bytes yet, record it;
|
||||
// otherwise must match the one we saw before.
|
||||
if (b == -1)
|
||||
b = ip->lo();
|
||||
else if (b != ip->lo())
|
||||
return -1;
|
||||
break;
|
||||
|
||||
case kInstNop:
|
||||
case kInstCapture:
|
||||
case kInstEmptyWidth:
|
||||
// Continue on.
|
||||
// Ignore ip->empty() flags for kInstEmptyWidth
|
||||
// in order to be as conservative as possible
|
||||
// (assume all possible empty-width flags are true).
|
||||
if (ip->out())
|
||||
q.insert(ip->out());
|
||||
break;
|
||||
|
||||
case kInstAlt:
|
||||
case kInstAltMatch:
|
||||
// Explore alternatives.
|
||||
if (ip->out())
|
||||
q.insert(ip->out());
|
||||
if (ip->out1())
|
||||
q.insert(ip->out1());
|
||||
break;
|
||||
|
||||
case kInstFail:
|
||||
break;
|
||||
}
|
||||
}
|
||||
return b;
|
||||
}
|
||||
|
||||
bool
|
||||
Prog::SearchNFA(const StringPiece& text, const StringPiece& context,
|
||||
Anchor anchor, MatchKind kind,
|
||||
StringPiece* match, int nmatch) {
|
||||
if (NFA::Debug)
|
||||
Dump();
|
||||
|
||||
NFA nfa(this);
|
||||
StringPiece sp;
|
||||
if (kind == kFullMatch) {
|
||||
anchor = kAnchored;
|
||||
if (nmatch == 0) {
|
||||
match = &sp;
|
||||
nmatch = 1;
|
||||
}
|
||||
}
|
||||
if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match, nmatch))
|
||||
return false;
|
||||
if (kind == kFullMatch && match[0].end() != text.end())
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// For each instruction i in the program reachable from the start, compute the
|
||||
// number of instructions reachable from i by following only empty transitions
|
||||
// and record that count as fanout[i].
|
||||
//
|
||||
// fanout holds the results and is also the work queue for the outer iteration.
|
||||
// reachable holds the reached nodes for the inner iteration.
|
||||
void Prog::Fanout(SparseArray<int>* fanout) {
|
||||
DCHECK_EQ(fanout->max_size(), size());
|
||||
SparseSet reachable(size());
|
||||
fanout->clear();
|
||||
fanout->set_new(start(), 0);
|
||||
for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i) {
|
||||
int* count = &i->second;
|
||||
reachable.clear();
|
||||
reachable.insert(i->index());
|
||||
for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) {
|
||||
Prog::Inst* ip = inst(*j);
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()";
|
||||
break;
|
||||
|
||||
case kInstByteRange:
|
||||
(*count)++;
|
||||
if (!fanout->has_index(ip->out())) {
|
||||
fanout->set_new(ip->out(), 0);
|
||||
}
|
||||
break;
|
||||
|
||||
case kInstAlt:
|
||||
case kInstAltMatch:
|
||||
reachable.insert(ip->out1());
|
||||
// fall through
|
||||
|
||||
case kInstCapture:
|
||||
case kInstEmptyWidth:
|
||||
case kInstNop:
|
||||
reachable.insert(ip->out());
|
||||
break;
|
||||
|
||||
case kInstMatch:
|
||||
case kInstFail:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace re2
|
611
third_party/re2/re2/onepass.cc
vendored
611
third_party/re2/re2/onepass.cc
vendored
@ -1,611 +0,0 @@
|
||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Tested by search_test.cc.
|
||||
//
|
||||
// Prog::SearchOnePass is an efficient implementation of
|
||||
// regular expression search with submatch tracking for
|
||||
// what I call "one-pass regular expressions". (An alternate
|
||||
// name might be "backtracking-free regular expressions".)
|
||||
//
|
||||
// One-pass regular expressions have the property that
|
||||
// at each input byte during an anchored match, there may be
|
||||
// multiple alternatives but only one can proceed for any
|
||||
// given input byte.
|
||||
//
|
||||
// For example, the regexp /x*yx*/ is one-pass: you read
|
||||
// x's until a y, then you read the y, then you keep reading x's.
|
||||
// At no point do you have to guess what to do or back up
|
||||
// and try a different guess.
|
||||
//
|
||||
// On the other hand, /x*x/ is not one-pass: when you're
|
||||
// looking at an input "x", it's not clear whether you should
|
||||
// use it to extend the x* or as the final x.
|
||||
//
|
||||
// More examples: /([^ ]*) (.*)/ is one-pass; /(.*) (.*)/ is not.
|
||||
// /(\d+)-(\d+)/ is one-pass; /(\d+).(\d+)/ is not.
|
||||
//
|
||||
// A simple intuition for identifying one-pass regular expressions
|
||||
// is that it's always immediately obvious when a repetition ends.
|
||||
// It must also be immediately obvious which branch of an | to take:
|
||||
//
|
||||
// /x(y|z)/ is one-pass, but /(xy|xz)/ is not.
|
||||
//
|
||||
// The NFA-based search in nfa.cc does some bookkeeping to
|
||||
// avoid the need for backtracking and its associated exponential blowup.
|
||||
// But if we have a one-pass regular expression, there is no
|
||||
// possibility of backtracking, so there is no need for the
|
||||
// extra bookkeeping. Hence, this code.
|
||||
//
|
||||
// On a one-pass regular expression, the NFA code in nfa.cc
|
||||
// runs at about 1/20 of the backtracking-based PCRE speed.
|
||||
// In contrast, the code in this file runs at about the same
|
||||
// speed as PCRE.
|
||||
//
|
||||
// One-pass regular expressions get used a lot when RE is
|
||||
// used for parsing simple strings, so it pays off to
|
||||
// notice them and handle them efficiently.
|
||||
//
|
||||
// See also Anne Brüggemann-Klein and Derick Wood,
|
||||
// "One-unambiguous regular languages", Information and Computation 142(2).
|
||||
|
||||
#include <string.h>
|
||||
#include <map>
|
||||
#include "util/util.h"
|
||||
#include "util/sparse_set.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/stringpiece.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
static const int Debug = 0;
|
||||
|
||||
// The key insight behind this implementation is that the
|
||||
// non-determinism in an NFA for a one-pass regular expression
|
||||
// is contained. To explain what that means, first a
|
||||
// refresher about what regular expression programs look like
|
||||
// and how the usual NFA execution runs.
|
||||
//
|
||||
// In a regular expression program, only the kInstByteRange
|
||||
// instruction processes an input byte c and moves on to the
|
||||
// next byte in the string (it does so if c is in the given range).
|
||||
// The kInstByteRange instructions correspond to literal characters
|
||||
// and character classes in the regular expression.
|
||||
//
|
||||
// The kInstAlt instructions are used as wiring to connect the
|
||||
// kInstByteRange instructions together in interesting ways when
|
||||
// implementing | + and *.
|
||||
// The kInstAlt instruction forks execution, like a goto that
|
||||
// jumps to ip->out() and ip->out1() in parallel. Each of the
|
||||
// resulting computation paths is called a thread.
|
||||
//
|
||||
// The other instructions -- kInstEmptyWidth, kInstMatch, kInstCapture --
|
||||
// are interesting in their own right but like kInstAlt they don't
|
||||
// advance the input pointer. Only kInstByteRange does.
|
||||
//
|
||||
// The automaton execution in nfa.cc runs all the possible
|
||||
// threads of execution in lock-step over the input. To process
|
||||
// a particular byte, each thread gets run until it either dies
|
||||
// or finds a kInstByteRange instruction matching the byte.
|
||||
// If the latter happens, the thread stops just past the
|
||||
// kInstByteRange instruction (at ip->out()) and waits for
|
||||
// the other threads to finish processing the input byte.
|
||||
// Then, once all the threads have processed that input byte,
|
||||
// the whole process repeats. The kInstAlt state instruction
|
||||
// might create new threads during input processing, but no
|
||||
// matter what, all the threads stop after a kInstByteRange
|
||||
// and wait for the other threads to "catch up".
|
||||
// Running in lock step like this ensures that the NFA reads
|
||||
// the input string only once.
|
||||
//
|
||||
// Each thread maintains its own set of capture registers
|
||||
// (the string positions at which it executed the kInstCapture
|
||||
// instructions corresponding to capturing parentheses in the
|
||||
// regular expression). Repeated copying of the capture registers
|
||||
// is the main performance bottleneck in the NFA implementation.
|
||||
//
|
||||
// A regular expression program is "one-pass" if, no matter what
|
||||
// the input string, there is only one thread that makes it
|
||||
// past a kInstByteRange instruction at each input byte. This means
|
||||
// that there is in some sense only one active thread throughout
|
||||
// the execution. Other threads might be created during the
|
||||
// processing of an input byte, but they are ephemeral: only one
|
||||
// thread is left to start processing the next input byte.
|
||||
// This is what I meant above when I said the non-determinism
|
||||
// was "contained".
|
||||
//
|
||||
// To execute a one-pass regular expression program, we can build
|
||||
// a DFA (no non-determinism) that has at most as many states as
|
||||
// the NFA (compare this to the possibly exponential number of states
|
||||
// in the general case). Each state records, for each possible
|
||||
// input byte, the next state along with the conditions required
|
||||
// before entering that state -- empty-width flags that must be true
|
||||
// and capture operations that must be performed. It also records
|
||||
// the set of conditions required to finish a match at that
|
||||
// point in the input rather than process the next byte.
|
||||
|
||||
// A state in the one-pass NFA - just an array of actions indexed
|
||||
// by the bytemap_[] of the next input byte. (The bytemap
|
||||
// maps next input bytes into equivalence classes, to reduce
|
||||
// the memory footprint.)
|
||||
struct OneState {
|
||||
uint32 matchcond; // conditions to match right now.
|
||||
uint32 action[1];
|
||||
};
|
||||
|
||||
// The uint32 conditions in the action are a combination of
|
||||
// condition and capture bits and the next state. The bottom 16 bits
|
||||
// are the condition and capture bits, and the top 16 are the index of
|
||||
// the next state.
|
||||
//
|
||||
// Bits 0-5 are the empty-width flags from prog.h.
|
||||
// Bit 6 is kMatchWins, which means the match takes
|
||||
// priority over moving to next in a first-match search.
|
||||
// The remaining bits mark capture registers that should
|
||||
// be set to the current input position. The capture bits
|
||||
// start at index 2, since the search loop can take care of
|
||||
// cap[0], cap[1] (the overall match position).
|
||||
// That means we can handle up to 5 capturing parens: $1 through $4, plus $0.
|
||||
// No input position can satisfy both kEmptyWordBoundary
|
||||
// and kEmptyNonWordBoundary, so we can use that as a sentinel
|
||||
// instead of needing an extra bit.
|
||||
|
||||
static const int kIndexShift = 16; // number of bits below index
|
||||
static const int kEmptyShift = 6; // number of empty flags in prog.h
|
||||
static const int kRealCapShift = kEmptyShift + 1;
|
||||
static const int kRealMaxCap = (kIndexShift - kRealCapShift) / 2 * 2;
|
||||
|
||||
// Parameters used to skip over cap[0], cap[1].
|
||||
static const int kCapShift = kRealCapShift - 2;
|
||||
static const int kMaxCap = kRealMaxCap + 2;
|
||||
|
||||
static const uint32 kMatchWins = 1 << kEmptyShift;
|
||||
static const uint32 kCapMask = ((1 << kRealMaxCap) - 1) << kRealCapShift;
|
||||
|
||||
static const uint32 kImpossible = kEmptyWordBoundary | kEmptyNonWordBoundary;
|
||||
|
||||
// Check, at compile time, that prog.h agrees with math above.
|
||||
// This function is never called.
|
||||
void OnePass_Checks() {
|
||||
COMPILE_ASSERT((1<<kEmptyShift)-1 == kEmptyAllFlags,
|
||||
kEmptyShift_disagrees_with_kEmptyAllFlags);
|
||||
// kMaxCap counts pointers, kMaxOnePassCapture counts pairs.
|
||||
COMPILE_ASSERT(kMaxCap == Prog::kMaxOnePassCapture*2,
|
||||
kMaxCap_disagrees_with_kMaxOnePassCapture);
|
||||
}
|
||||
|
||||
// Returns whether every empty-width flag requested in cond holds at
// position p within context.  Bits of cond outside kEmptyAllFlags are
// ignored.
static bool Satisfy(uint32 cond, const StringPiece& context, const char* p) {
  const uint32 present = Prog::EmptyFlags(context, p);
  const uint32 missing = cond & kEmptyAllFlags & ~present;
  return missing == 0;
}
|
||||
|
||||
// Apply the capture bits in cond, saving p to the appropriate
|
||||
// locations in cap[].
|
||||
static void ApplyCaptures(uint32 cond, const char* p,
|
||||
const char** cap, int ncap) {
|
||||
for (int i = 2; i < ncap; i++)
|
||||
if (cond & (1 << kCapShift << i))
|
||||
cap[i] = p;
|
||||
}
|
||||
|
||||
// Compute a node pointer.
|
||||
// Basically (OneState*)(nodes + statesize*nodeindex)
|
||||
// but the version with the C++ casts overflows 80 characters (and is ugly).
|
||||
static inline OneState* IndexToNode(volatile uint8* nodes, int statesize,
|
||||
int nodeindex) {
|
||||
return reinterpret_cast<OneState*>(
|
||||
const_cast<uint8*>(nodes + statesize*nodeindex));
|
||||
}
|
||||
|
||||
bool Prog::SearchOnePass(const StringPiece& text,
|
||||
const StringPiece& const_context,
|
||||
Anchor anchor, MatchKind kind,
|
||||
StringPiece* match, int nmatch) {
|
||||
if (anchor != kAnchored && kind != kFullMatch) {
|
||||
LOG(DFATAL) << "Cannot use SearchOnePass for unanchored matches.";
|
||||
return false;
|
||||
}
|
||||
|
||||
// Make sure we have at least cap[1],
|
||||
// because we use it to tell if we matched.
|
||||
int ncap = 2*nmatch;
|
||||
if (ncap < 2)
|
||||
ncap = 2;
|
||||
|
||||
const char* cap[kMaxCap];
|
||||
for (int i = 0; i < ncap; i++)
|
||||
cap[i] = NULL;
|
||||
|
||||
const char* matchcap[kMaxCap];
|
||||
for (int i = 0; i < ncap; i++)
|
||||
matchcap[i] = NULL;
|
||||
|
||||
StringPiece context = const_context;
|
||||
if (context.begin() == NULL)
|
||||
context = text;
|
||||
if (anchor_start() && context.begin() != text.begin())
|
||||
return false;
|
||||
if (anchor_end() && context.end() != text.end())
|
||||
return false;
|
||||
if (anchor_end())
|
||||
kind = kFullMatch;
|
||||
|
||||
// State and act are marked volatile to
|
||||
// keep the compiler from re-ordering the
|
||||
// memory accesses walking over the NFA.
|
||||
// This is worth about 5%.
|
||||
volatile OneState* state = onepass_start_;
|
||||
volatile uint8* nodes = onepass_nodes_;
|
||||
volatile uint32 statesize = onepass_statesize_;
|
||||
uint8* bytemap = bytemap_;
|
||||
const char* bp = text.begin();
|
||||
const char* ep = text.end();
|
||||
const char* p;
|
||||
bool matched = false;
|
||||
matchcap[0] = bp;
|
||||
cap[0] = bp;
|
||||
uint32 nextmatchcond = state->matchcond;
|
||||
for (p = bp; p < ep; p++) {
|
||||
int c = bytemap[*p & 0xFF];
|
||||
uint32 matchcond = nextmatchcond;
|
||||
uint32 cond = state->action[c];
|
||||
|
||||
// Determine whether we can reach act->next.
|
||||
// If so, advance state and nextmatchcond.
|
||||
if ((cond & kEmptyAllFlags) == 0 || Satisfy(cond, context, p)) {
|
||||
uint32 nextindex = cond >> kIndexShift;
|
||||
state = IndexToNode(nodes, statesize, nextindex);
|
||||
nextmatchcond = state->matchcond;
|
||||
} else {
|
||||
state = NULL;
|
||||
nextmatchcond = kImpossible;
|
||||
}
|
||||
|
||||
// This code section is carefully tuned.
|
||||
// The goto sequence is about 10% faster than the
|
||||
// obvious rewrite as a large if statement in the
|
||||
// ASCIIMatchRE2 and DotMatchRE2 benchmarks.
|
||||
|
||||
// Saving the match capture registers is expensive.
|
||||
// Is this intermediate match worth thinking about?
|
||||
|
||||
// Not if we want a full match.
|
||||
if (kind == kFullMatch)
|
||||
goto skipmatch;
|
||||
|
||||
// Not if it's impossible.
|
||||
if (matchcond == kImpossible)
|
||||
goto skipmatch;
|
||||
|
||||
// Not if the possible match is beaten by the certain
|
||||
// match at the next byte. When this test is useless
|
||||
// (e.g., HTTPPartialMatchRE2) it slows the loop by
|
||||
// about 10%, but when it avoids work (e.g., DotMatchRE2),
|
||||
// it cuts the loop execution by about 45%.
|
||||
if ((cond & kMatchWins) == 0 && (nextmatchcond & kEmptyAllFlags) == 0)
|
||||
goto skipmatch;
|
||||
|
||||
// Finally, the match conditions must be satisfied.
|
||||
if ((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p)) {
|
||||
for (int i = 2; i < 2*nmatch; i++)
|
||||
matchcap[i] = cap[i];
|
||||
if (nmatch > 1 && (matchcond & kCapMask))
|
||||
ApplyCaptures(matchcond, p, matchcap, ncap);
|
||||
matchcap[1] = p;
|
||||
matched = true;
|
||||
|
||||
// If we're in longest match mode, we have to keep
|
||||
// going and see if we find a longer match.
|
||||
// In first match mode, we can stop if the match
|
||||
// takes priority over the next state for this input byte.
|
||||
// That bit is per-input byte and thus in cond, not matchcond.
|
||||
if (kind == kFirstMatch && (cond & kMatchWins))
|
||||
goto done;
|
||||
}
|
||||
|
||||
skipmatch:
|
||||
if (state == NULL)
|
||||
goto done;
|
||||
if ((cond & kCapMask) && nmatch > 1)
|
||||
ApplyCaptures(cond, p, cap, ncap);
|
||||
}
|
||||
|
||||
// Look for match at end of input.
|
||||
{
|
||||
uint32 matchcond = state->matchcond;
|
||||
if (matchcond != kImpossible &&
|
||||
((matchcond & kEmptyAllFlags) == 0 || Satisfy(matchcond, context, p))) {
|
||||
if (nmatch > 1 && (matchcond & kCapMask))
|
||||
ApplyCaptures(matchcond, p, cap, ncap);
|
||||
for (int i = 2; i < ncap; i++)
|
||||
matchcap[i] = cap[i];
|
||||
matchcap[1] = p;
|
||||
matched = true;
|
||||
}
|
||||
}
|
||||
|
||||
done:
|
||||
if (!matched)
|
||||
return false;
|
||||
for (int i = 0; i < nmatch; i++)
|
||||
match[i].set(matchcap[2*i],
|
||||
static_cast<int>(matchcap[2*i+1] - matchcap[2*i]));
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
// Analysis to determine whether a given regexp program is one-pass.
|
||||
|
||||
// If ip is not on workq, adds ip to work queue and returns true.
|
||||
// If ip is already on work queue, does nothing and returns false.
|
||||
// If ip is NULL, does nothing and returns true (pretends to add it).
|
||||
typedef SparseSet Instq;
|
||||
static bool AddQ(Instq *q, int id) {
|
||||
if (id == 0)
|
||||
return true;
|
||||
if (q->contains(id))
|
||||
return false;
|
||||
q->insert(id);
|
||||
return true;
|
||||
}
|
||||
|
||||
struct InstCond {
|
||||
int id;
|
||||
uint32 cond;
|
||||
};
|
||||
|
||||
// Returns whether this is a one-pass program; that is,
|
||||
// returns whether it is safe to use SearchOnePass on this program.
|
||||
// These conditions must be true for any instruction ip:
|
||||
//
|
||||
// (1) for any other Inst nip, there is at most one input-free
|
||||
// path from ip to nip.
|
||||
// (2) there is at most one kInstByte instruction reachable from
|
||||
// ip that matches any particular byte c.
|
||||
// (3) there is at most one input-free path from ip to a kInstMatch
|
||||
// instruction.
|
||||
//
|
||||
// This is actually just a conservative approximation: it might
|
||||
// return false when the answer is true, when kInstEmptyWidth
|
||||
// instructions are involved.
|
||||
// Constructs and saves corresponding one-pass NFA on success.
|
||||
bool Prog::IsOnePass() {
|
||||
if (did_onepass_)
|
||||
return onepass_start_ != NULL;
|
||||
did_onepass_ = true;
|
||||
|
||||
if (start() == 0) // no match
|
||||
return false;
|
||||
|
||||
// Steal memory for the one-pass NFA from the overall DFA budget.
|
||||
// Willing to use at most 1/4 of the DFA budget (heuristic).
|
||||
// Limit max node count to 65000 as a conservative estimate to
|
||||
// avoid overflowing 16-bit node index in encoding.
|
||||
int maxnodes = 2 + byte_inst_count_;
|
||||
int statesize = sizeof(OneState) + (bytemap_range_-1)*sizeof(uint32);
|
||||
if (maxnodes >= 65000 || dfa_mem_ / 4 / statesize < maxnodes)
|
||||
return false;
|
||||
|
||||
// Flood the graph starting at the start state, and check
|
||||
// that in each reachable state, each possible byte leads
|
||||
// to a unique next state.
|
||||
int size = this->size();
|
||||
InstCond *stack = new InstCond[size];
|
||||
|
||||
int* nodebyid = new int[size]; // indexed by ip
|
||||
memset(nodebyid, 0xFF, size*sizeof nodebyid[0]);
|
||||
|
||||
uint8* nodes = new uint8[maxnodes*statesize];
|
||||
uint8* nodep = nodes;
|
||||
|
||||
Instq tovisit(size), workq(size);
|
||||
AddQ(&tovisit, start());
|
||||
nodebyid[start()] = 0;
|
||||
nodep += statesize;
|
||||
int nalloc = 1;
|
||||
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
|
||||
int id = *it;
|
||||
int nodeindex = nodebyid[id];
|
||||
OneState* node = IndexToNode(nodes, statesize, nodeindex);
|
||||
|
||||
// Flood graph using manual stack, filling in actions as found.
|
||||
// Default is none.
|
||||
for (int b = 0; b < bytemap_range_; b++)
|
||||
node->action[b] = kImpossible;
|
||||
node->matchcond = kImpossible;
|
||||
|
||||
workq.clear();
|
||||
bool matched = false;
|
||||
int nstack = 0;
|
||||
stack[nstack].id = id;
|
||||
stack[nstack++].cond = 0;
|
||||
while (nstack > 0) {
|
||||
int id = stack[--nstack].id;
|
||||
Prog::Inst* ip = inst(id);
|
||||
uint32 cond = stack[nstack].cond;
|
||||
switch (ip->opcode()) {
|
||||
case kInstAltMatch:
|
||||
// TODO(rsc): Ignoring kInstAltMatch optimization.
|
||||
// Should implement it in this engine, but it's subtle.
|
||||
// Fall through.
|
||||
case kInstAlt:
|
||||
// If already on work queue, (1) is violated: bail out.
|
||||
if (!AddQ(&workq, ip->out()) || !AddQ(&workq, ip->out1()))
|
||||
goto fail;
|
||||
stack[nstack].id = ip->out1();
|
||||
stack[nstack++].cond = cond;
|
||||
stack[nstack].id = ip->out();
|
||||
stack[nstack++].cond = cond;
|
||||
break;
|
||||
|
||||
case kInstByteRange: {
|
||||
int nextindex = nodebyid[ip->out()];
|
||||
if (nextindex == -1) {
|
||||
if (nalloc >= maxnodes) {
|
||||
if (Debug)
|
||||
LOG(ERROR)
|
||||
<< StringPrintf("Not OnePass: hit node limit %d > %d",
|
||||
nalloc, maxnodes);
|
||||
goto fail;
|
||||
}
|
||||
nextindex = nalloc;
|
||||
nodep += statesize;
|
||||
nodebyid[ip->out()] = nextindex;
|
||||
nalloc++;
|
||||
AddQ(&tovisit, ip->out());
|
||||
}
|
||||
if (matched)
|
||||
cond |= kMatchWins;
|
||||
for (int c = ip->lo(); c <= ip->hi(); c++) {
|
||||
int b = bytemap_[c];
|
||||
c = unbytemap_[b]; // last c in byte class
|
||||
uint32 act = node->action[b];
|
||||
uint32 newact = (nextindex << kIndexShift) | cond;
|
||||
if ((act & kImpossible) == kImpossible) {
|
||||
node->action[b] = newact;
|
||||
} else if (act != newact) {
|
||||
if (Debug) {
|
||||
LOG(ERROR)
|
||||
<< StringPrintf("Not OnePass: conflict on byte "
|
||||
"%#x at state %d",
|
||||
c, *it);
|
||||
}
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
if (ip->foldcase()) {
|
||||
Rune lo = max<Rune>(ip->lo(), 'a') + 'A' - 'a';
|
||||
Rune hi = min<Rune>(ip->hi(), 'z') + 'A' - 'a';
|
||||
for (int c = lo; c <= hi; c++) {
|
||||
int b = bytemap_[c];
|
||||
c = unbytemap_[b]; // last c in class
|
||||
uint32 act = node->action[b];
|
||||
uint32 newact = (nextindex << kIndexShift) | cond;
|
||||
if ((act & kImpossible) == kImpossible) {
|
||||
node->action[b] = newact;
|
||||
} else if (act != newact) {
|
||||
if (Debug) {
|
||||
LOG(ERROR)
|
||||
<< StringPrintf("Not OnePass: conflict on byte "
|
||||
"%#x at state %d",
|
||||
c, *it);
|
||||
}
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case kInstCapture:
|
||||
if (ip->cap() < kMaxCap)
|
||||
cond |= (1 << kCapShift) << ip->cap();
|
||||
goto QueueEmpty;
|
||||
|
||||
case kInstEmptyWidth:
|
||||
cond |= ip->empty();
|
||||
goto QueueEmpty;
|
||||
|
||||
case kInstNop:
|
||||
QueueEmpty:
|
||||
// kInstCapture and kInstNop always proceed to ip->out().
|
||||
// kInstEmptyWidth only sometimes proceeds to ip->out(),
|
||||
// but as a conservative approximation we assume it always does.
|
||||
// We could be a little more precise by looking at what c
|
||||
// is, but that seems like overkill.
|
||||
|
||||
// If already on work queue, (1) is violated: bail out.
|
||||
if (!AddQ(&workq, ip->out())) {
|
||||
if (Debug) {
|
||||
LOG(ERROR) << StringPrintf("Not OnePass: multiple paths"
|
||||
" %d -> %d\n",
|
||||
*it, ip->out());
|
||||
}
|
||||
goto fail;
|
||||
}
|
||||
stack[nstack].id = ip->out();
|
||||
stack[nstack++].cond = cond;
|
||||
break;
|
||||
|
||||
case kInstMatch:
|
||||
if (matched) {
|
||||
// (3) is violated
|
||||
if (Debug) {
|
||||
LOG(ERROR) << StringPrintf("Not OnePass: multiple matches"
|
||||
" from %d\n", *it);
|
||||
}
|
||||
goto fail;
|
||||
}
|
||||
matched = true;
|
||||
node->matchcond = cond;
|
||||
break;
|
||||
|
||||
case kInstFail:
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (Debug) { // For debugging, dump one-pass NFA to LOG(ERROR).
|
||||
string dump = "prog dump:\n" + Dump() + "node dump\n";
|
||||
map<int, int> idmap;
|
||||
for (int i = 0; i < size; i++)
|
||||
if (nodebyid[i] != -1)
|
||||
idmap[nodebyid[i]] = i;
|
||||
|
||||
StringAppendF(&dump, "byte ranges:\n");
|
||||
int i = 0;
|
||||
for (int b = 0; b < bytemap_range_; b++) {
|
||||
int lo = i;
|
||||
while (bytemap_[i] == b)
|
||||
i++;
|
||||
StringAppendF(&dump, "\t%d: %#x-%#x\n", b, lo, i - 1);
|
||||
}
|
||||
|
||||
for (Instq::iterator it = tovisit.begin(); it != tovisit.end(); ++it) {
|
||||
int id = *it;
|
||||
int nodeindex = nodebyid[id];
|
||||
if (nodeindex == -1)
|
||||
continue;
|
||||
OneState* node = IndexToNode(nodes, statesize, nodeindex);
|
||||
string s;
|
||||
StringAppendF(&dump, "node %d id=%d: matchcond=%#x\n",
|
||||
nodeindex, id, node->matchcond);
|
||||
for (int i = 0; i < bytemap_range_; i++) {
|
||||
if ((node->action[i] & kImpossible) == kImpossible)
|
||||
continue;
|
||||
StringAppendF(&dump, " %d cond %#x -> %d id=%d\n",
|
||||
i, node->action[i] & 0xFFFF,
|
||||
node->action[i] >> kIndexShift,
|
||||
idmap[node->action[i] >> kIndexShift]);
|
||||
}
|
||||
}
|
||||
LOG(ERROR) << dump;
|
||||
}
|
||||
|
||||
// Overallocated earlier; cut down to actual size.
|
||||
nodep = new uint8[nalloc*statesize];
|
||||
memmove(nodep, nodes, nalloc*statesize);
|
||||
delete[] nodes;
|
||||
nodes = nodep;
|
||||
|
||||
onepass_start_ = IndexToNode(nodes, statesize, nodebyid[start()]);
|
||||
onepass_nodes_ = nodes;
|
||||
onepass_statesize_ = statesize;
|
||||
dfa_mem_ -= nalloc*statesize;
|
||||
|
||||
delete[] stack;
|
||||
delete[] nodebyid;
|
||||
return true;
|
||||
|
||||
fail:
|
||||
delete[] stack;
|
||||
delete[] nodebyid;
|
||||
delete[] nodes;
|
||||
return false;
|
||||
}
|
||||
|
||||
} // namespace re2
|
2270
third_party/re2/re2/parse.cc
vendored
2270
third_party/re2/re2/parse.cc
vendored
File diff suppressed because it is too large
Load Diff
119
third_party/re2/re2/perl_groups.cc
vendored
119
third_party/re2/re2/perl_groups.cc
vendored
@ -1,119 +0,0 @@
|
||||
// GENERATED BY make_perl_groups.pl; DO NOT EDIT.
|
||||
// make_perl_groups.pl >perl_groups.cc
|
||||
|
||||
#include "re2/unicode_groups.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
static const URange16 code1[] = { /* \d */
|
||||
{ 0x30, 0x39 },
|
||||
};
|
||||
static const URange16 code2[] = { /* \s */
|
||||
{ 0x9, 0xa },
|
||||
{ 0xc, 0xd },
|
||||
{ 0x20, 0x20 },
|
||||
};
|
||||
static const URange16 code3[] = { /* \w */
|
||||
{ 0x30, 0x39 },
|
||||
{ 0x41, 0x5a },
|
||||
{ 0x5f, 0x5f },
|
||||
{ 0x61, 0x7a },
|
||||
};
|
||||
const UGroup perl_groups[] = {
|
||||
{ "\\d", +1, code1, 1 },
|
||||
{ "\\D", -1, code1, 1 },
|
||||
{ "\\s", +1, code2, 3 },
|
||||
{ "\\S", -1, code2, 3 },
|
||||
{ "\\w", +1, code3, 4 },
|
||||
{ "\\W", -1, code3, 4 },
|
||||
};
|
||||
const int num_perl_groups = 6;
|
||||
static const URange16 code4[] = { /* [:alnum:] */
|
||||
{ 0x30, 0x39 },
|
||||
{ 0x41, 0x5a },
|
||||
{ 0x61, 0x7a },
|
||||
};
|
||||
static const URange16 code5[] = { /* [:alpha:] */
|
||||
{ 0x41, 0x5a },
|
||||
{ 0x61, 0x7a },
|
||||
};
|
||||
static const URange16 code6[] = { /* [:ascii:] */
|
||||
{ 0x0, 0x7f },
|
||||
};
|
||||
static const URange16 code7[] = { /* [:blank:] */
|
||||
{ 0x9, 0x9 },
|
||||
{ 0x20, 0x20 },
|
||||
};
|
||||
static const URange16 code8[] = { /* [:cntrl:] */
|
||||
{ 0x0, 0x1f },
|
||||
{ 0x7f, 0x7f },
|
||||
};
|
||||
static const URange16 code9[] = { /* [:digit:] */
|
||||
{ 0x30, 0x39 },
|
||||
};
|
||||
static const URange16 code10[] = { /* [:graph:] */
|
||||
{ 0x21, 0x7e },
|
||||
};
|
||||
static const URange16 code11[] = { /* [:lower:] */
|
||||
{ 0x61, 0x7a },
|
||||
};
|
||||
static const URange16 code12[] = { /* [:print:] */
|
||||
{ 0x20, 0x7e },
|
||||
};
|
||||
static const URange16 code13[] = { /* [:punct:] */
|
||||
{ 0x21, 0x2f },
|
||||
{ 0x3a, 0x40 },
|
||||
{ 0x5b, 0x60 },
|
||||
{ 0x7b, 0x7e },
|
||||
};
|
||||
static const URange16 code14[] = { /* [:space:] */
|
||||
{ 0x9, 0xd },
|
||||
{ 0x20, 0x20 },
|
||||
};
|
||||
static const URange16 code15[] = { /* [:upper:] */
|
||||
{ 0x41, 0x5a },
|
||||
};
|
||||
static const URange16 code16[] = { /* [:word:] */
|
||||
{ 0x30, 0x39 },
|
||||
{ 0x41, 0x5a },
|
||||
{ 0x5f, 0x5f },
|
||||
{ 0x61, 0x7a },
|
||||
};
|
||||
static const URange16 code17[] = { /* [:xdigit:] */
|
||||
{ 0x30, 0x39 },
|
||||
{ 0x41, 0x46 },
|
||||
{ 0x61, 0x66 },
|
||||
};
|
||||
const UGroup posix_groups[] = {
|
||||
{ "[:alnum:]", +1, code4, 3 },
|
||||
{ "[:^alnum:]", -1, code4, 3 },
|
||||
{ "[:alpha:]", +1, code5, 2 },
|
||||
{ "[:^alpha:]", -1, code5, 2 },
|
||||
{ "[:ascii:]", +1, code6, 1 },
|
||||
{ "[:^ascii:]", -1, code6, 1 },
|
||||
{ "[:blank:]", +1, code7, 2 },
|
||||
{ "[:^blank:]", -1, code7, 2 },
|
||||
{ "[:cntrl:]", +1, code8, 2 },
|
||||
{ "[:^cntrl:]", -1, code8, 2 },
|
||||
{ "[:digit:]", +1, code9, 1 },
|
||||
{ "[:^digit:]", -1, code9, 1 },
|
||||
{ "[:graph:]", +1, code10, 1 },
|
||||
{ "[:^graph:]", -1, code10, 1 },
|
||||
{ "[:lower:]", +1, code11, 1 },
|
||||
{ "[:^lower:]", -1, code11, 1 },
|
||||
{ "[:print:]", +1, code12, 1 },
|
||||
{ "[:^print:]", -1, code12, 1 },
|
||||
{ "[:punct:]", +1, code13, 4 },
|
||||
{ "[:^punct:]", -1, code13, 4 },
|
||||
{ "[:space:]", +1, code14, 2 },
|
||||
{ "[:^space:]", -1, code14, 2 },
|
||||
{ "[:upper:]", +1, code15, 1 },
|
||||
{ "[:^upper:]", -1, code15, 1 },
|
||||
{ "[:word:]", +1, code16, 4 },
|
||||
{ "[:^word:]", -1, code16, 4 },
|
||||
{ "[:xdigit:]", +1, code17, 3 },
|
||||
{ "[:^xdigit:]", -1, code17, 3 },
|
||||
};
|
||||
const int num_posix_groups = 28;
|
||||
|
||||
} // namespace re2
|
709
third_party/re2/re2/prefilter.cc
vendored
709
third_party/re2/re2/prefilter.cc
vendored
@ -1,709 +0,0 @@
|
||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "util/util.h"
|
||||
#include "re2/prefilter.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/unicode_casefold.h"
|
||||
#include "re2/walker-inl.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
static const int Trace = false;
|
||||
|
||||
typedef set<string>::iterator SSIter;
|
||||
typedef set<string>::const_iterator ConstSSIter;
|
||||
|
||||
GLOBAL_MUTEX(alloc_id_mutex);
|
||||
static int alloc_id = 100000; // Used for debugging.
|
||||
// Initializes a Prefilter, allocating subs_ as necessary.
|
||||
Prefilter::Prefilter(Op op) {
|
||||
op_ = op;
|
||||
subs_ = NULL;
|
||||
if (op_ == AND || op_ == OR)
|
||||
subs_ = new vector<Prefilter*>;
|
||||
|
||||
GLOBAL_MUTEX_LOCK(alloc_id_mutex);
|
||||
alloc_id_ = alloc_id++;
|
||||
GLOBAL_MUTEX_UNLOCK(alloc_id_mutex);
|
||||
VLOG(10) << "alloc_id: " << alloc_id_;
|
||||
}
|
||||
|
||||
// Destroys a Prefilter.
|
||||
Prefilter::~Prefilter() {
|
||||
VLOG(10) << "Deleted: " << alloc_id_;
|
||||
if (subs_) {
|
||||
for (size_t i = 0; i < subs_->size(); i++)
|
||||
delete (*subs_)[i];
|
||||
delete subs_;
|
||||
subs_ = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
// Simplify if the node is an empty Or or And.
|
||||
Prefilter* Prefilter::Simplify() {
|
||||
if (op_ != AND && op_ != OR) {
|
||||
return this;
|
||||
}
|
||||
|
||||
// Nothing left in the AND/OR.
|
||||
if (subs_->size() == 0) {
|
||||
if (op_ == AND)
|
||||
op_ = ALL; // AND of nothing is true
|
||||
else
|
||||
op_ = NONE; // OR of nothing is false
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
// Just one subnode: throw away wrapper.
|
||||
if (subs_->size() == 1) {
|
||||
Prefilter* a = (*subs_)[0];
|
||||
subs_->clear();
|
||||
delete this;
|
||||
return a->Simplify();
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
// Combines two Prefilters together to create an "op" (AND or OR).
|
||||
// The passed Prefilters will be part of the returned Prefilter or deleted.
|
||||
// Does lots of work to avoid creating unnecessarily complicated structures.
|
||||
Prefilter* Prefilter::AndOr(Op op, Prefilter* a, Prefilter* b) {
|
||||
// If a, b can be rewritten as op, do so.
|
||||
a = a->Simplify();
|
||||
b = b->Simplify();
|
||||
|
||||
// Canonicalize: a->op <= b->op.
|
||||
if (a->op() > b->op()) {
|
||||
Prefilter* t = a;
|
||||
a = b;
|
||||
b = t;
|
||||
}
|
||||
|
||||
// Trivial cases.
|
||||
// ALL AND b = b
|
||||
// NONE OR b = b
|
||||
// ALL OR b = ALL
|
||||
// NONE AND b = NONE
|
||||
// Don't need to look at b, because of canonicalization above.
|
||||
// ALL and NONE are smallest opcodes.
|
||||
if (a->op() == ALL || a->op() == NONE) {
|
||||
if ((a->op() == ALL && op == AND) ||
|
||||
(a->op() == NONE && op == OR)) {
|
||||
delete a;
|
||||
return b;
|
||||
} else {
|
||||
delete b;
|
||||
return a;
|
||||
}
|
||||
}
|
||||
|
||||
// If a and b match op, merge their contents.
|
||||
if (a->op() == op && b->op() == op) {
|
||||
for (size_t i = 0; i < b->subs()->size(); i++) {
|
||||
Prefilter* bb = (*b->subs())[i];
|
||||
a->subs()->push_back(bb);
|
||||
}
|
||||
b->subs()->clear();
|
||||
delete b;
|
||||
return a;
|
||||
}
|
||||
|
||||
// If a already has the same op as the op that is under construction
|
||||
// add in b (similarly if b already has the same op, add in a).
|
||||
if (b->op() == op) {
|
||||
Prefilter* t = a;
|
||||
a = b;
|
||||
b = t;
|
||||
}
|
||||
if (a->op() == op) {
|
||||
a->subs()->push_back(b);
|
||||
return a;
|
||||
}
|
||||
|
||||
// Otherwise just return the op.
|
||||
Prefilter* c = new Prefilter(op);
|
||||
c->subs()->push_back(a);
|
||||
c->subs()->push_back(b);
|
||||
return c;
|
||||
}
|
||||
|
||||
// Returns the conjunction of a and b; both arguments are consumed by AndOr.
Prefilter* Prefilter::And(Prefilter* a, Prefilter* b) {
  Prefilter* conjunction = AndOr(AND, a, b);
  return conjunction;
}
|
||||
|
||||
// Returns the disjunction of a and b; both arguments are consumed by AndOr.
Prefilter* Prefilter::Or(Prefilter* a, Prefilter* b) {
  Prefilter* disjunction = AndOr(OR, a, b);
  return disjunction;
}
|
||||
|
||||
// Removes from *ss every string that contains another member of the set.
//
// If we know "ab" is a required string, then it doesn't help at all to
// know that "abc" is also a required string, so "abc" is deleted: when a
// string search is used to filter regexps, matching "ab" already makes the
// regexp a candidate, so further matching "abc" is redundant.
//
// Each string is compared against every other member, not only the ones
// that sort after it.  A superstring can sort before its substring
// ("abc" < "bc"), and must still be removed.
static void SimplifyStringSet(std::set<std::string>* ss) {
  typedef std::set<std::string>::iterator Iter;
  for (Iter i = ss->begin(); i != ss->end(); ++i) {
    Iter j = ss->begin();
    while (j != ss->end()) {
      // Advance j early so that we can erase the element it pointed to.
      // Erasing never invalidates i, because i itself is never erased.
      Iter candidate = j;
      ++j;
      if (candidate != i && candidate->find(*i) != std::string::npos)
        ss->erase(candidate);
    }
  }
}
|
||||
|
||||
// Builds an OR over one atom per string in *ss, after first pruning
// redundant strings from the set (the set is modified in place).
// Returns NULL when the set is empty.
Prefilter* Prefilter::OrStrings(set<string>* ss) {
  SimplifyStringSet(ss);
  if (ss->empty())
    return NULL;

  // Start from NONE, the value AndOr treats as the identity for OR.
  Prefilter* disjunction = new Prefilter(NONE);
  for (SSIter s = ss->begin(); s != ss->end(); ++s)
    disjunction = Or(disjunction, FromString(*s));
  return disjunction;
}
|
||||
|
||||
// Lower-cases a rune.  Runes below Runeself only have 'A'..'Z' mapped;
// larger runes go through the unicode_tolower case-folding table and are
// returned unchanged when the table has no applicable entry.
static Rune ToLowerRune(Rune r) {
  if (r >= Runeself) {
    const CaseFold *fold =
        LookupCaseFold(unicode_tolower, num_unicode_tolower, r);
    if (fold != NULL && r >= fold->lo)
      return ApplyFold(fold, r);
    return r;
  }

  if ('A' <= r && r <= 'Z')
    r += 'a' - 'A';
  return r;
}
|
||||
|
||||
// Lower-cases 'A'..'Z'; every other rune is returned unchanged.
static Rune ToLowerRuneLatin1(Rune r) {
  const bool is_ascii_upper = ('A' <= r && r <= 'Z');
  if (is_ascii_upper)
    return r + ('a' - 'A');
  return r;
}
|
||||
|
||||
// Returns a newly allocated ATOM node whose atom is a copy of str.
Prefilter* Prefilter::FromString(const string& str) {
  Prefilter* atom = new Prefilter(Prefilter::ATOM);
  atom->atom_ = str;
  return atom;
}
|
||||
|
||||
// Information about a regexp used during computation of Prefilter.
|
||||
// Can be thought of as information about the set of strings matching
|
||||
// the given regular expression.
|
||||
class Prefilter::Info {
|
||||
public:
|
||||
Info();
|
||||
~Info();
|
||||
|
||||
// More constructors. They delete their Info* arguments.
|
||||
static Info* Alt(Info* a, Info* b);
|
||||
static Info* Concat(Info* a, Info* b);
|
||||
static Info* And(Info* a, Info* b);
|
||||
static Info* Star(Info* a);
|
||||
static Info* Plus(Info* a);
|
||||
static Info* Quest(Info* a);
|
||||
static Info* EmptyString();
|
||||
static Info* NoMatch();
|
||||
static Info* AnyChar();
|
||||
static Info* CClass(CharClass* cc, bool latin1);
|
||||
static Info* Literal(Rune r);
|
||||
static Info* LiteralLatin1(Rune r);
|
||||
static Info* AnyMatch();
|
||||
|
||||
// Format Info as a string.
|
||||
string ToString();
|
||||
|
||||
// Caller takes ownership of the Prefilter.
|
||||
Prefilter* TakeMatch();
|
||||
|
||||
set<string>& exact() { return exact_; }
|
||||
|
||||
bool is_exact() const { return is_exact_; }
|
||||
|
||||
class Walker;
|
||||
|
||||
private:
|
||||
set<string> exact_;
|
||||
|
||||
// When is_exact_ is true, the strings that match
|
||||
// are placed in exact_. When it is no longer an exact
|
||||
// set of strings that match this RE, then is_exact_
|
||||
// is false and the match_ contains the required match
|
||||
// criteria.
|
||||
bool is_exact_;
|
||||
|
||||
// Accumulated Prefilter query that any
|
||||
// match for this regexp is guaranteed to match.
|
||||
Prefilter* match_;
|
||||
};
|
||||
|
||||
|
||||
// Starts out inexact, with no match query and an empty exact set.
Prefilter::Info::Info()
    : is_exact_(false),
      match_(NULL) {
}
|
||||
|
||||
// Releases the match query if it is still held by this Info.
Prefilter::Info::~Info() {
  delete match_;
}
|
||||
|
||||
// Transfers ownership of the match query to the caller and leaves match_
// NULL.  An exact Info is first converted: its strings become an OR of
// atoms and is_exact_ is cleared.
Prefilter* Prefilter::Info::TakeMatch() {
  if (is_exact_) {
    match_ = Prefilter::OrStrings(&exact_);
    is_exact_ = false;
  }

  Prefilter* taken = match_;
  match_ = NULL;
  return taken;
}
|
||||
|
||||
// Format a Info in string form.
|
||||
string Prefilter::Info::ToString() {
|
||||
if (is_exact_) {
|
||||
int n = 0;
|
||||
string s;
|
||||
for (set<string>::iterator i = exact_.begin(); i != exact_.end(); ++i) {
|
||||
if (n++ > 0)
|
||||
s += ",";
|
||||
s += *i;
|
||||
}
|
||||
return s;
|
||||
}
|
||||
|
||||
if (match_)
|
||||
return match_->DebugString();
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
// Add the strings from src to dst.
|
||||
static void CopyIn(const set<string>& src, set<string>* dst) {
|
||||
for (ConstSSIter i = src.begin(); i != src.end(); ++i)
|
||||
dst->insert(*i);
|
||||
}
|
||||
|
||||
// Add the cross-product of a and b to dst.
|
||||
// (For each string i in a and j in b, add i+j.)
|
||||
static void CrossProduct(const set<string>& a,
|
||||
const set<string>& b,
|
||||
set<string>* dst) {
|
||||
for (ConstSSIter i = a.begin(); i != a.end(); ++i)
|
||||
for (ConstSSIter j = b.begin(); j != b.end(); ++j)
|
||||
dst->insert(*i + *j);
|
||||
}
|
||||
|
||||
// Concats a and b. Requires that both are exact sets.
|
||||
// Forms an exact set that is a crossproduct of a and b.
|
||||
Prefilter::Info* Prefilter::Info::Concat(Info* a, Info* b) {
|
||||
if (a == NULL)
|
||||
return b;
|
||||
DCHECK(a->is_exact_);
|
||||
DCHECK(b && b->is_exact_);
|
||||
Info *ab = new Info();
|
||||
|
||||
CrossProduct(a->exact_, b->exact_, &ab->exact_);
|
||||
ab->is_exact_ = true;
|
||||
|
||||
delete a;
|
||||
delete b;
|
||||
return ab;
|
||||
}
|
||||
|
||||
// Constructs an inexact Info for ab given a and b.
|
||||
// Used only when a or b is not exact or when the
|
||||
// exact cross product is likely to be too big.
|
||||
Prefilter::Info* Prefilter::Info::And(Info* a, Info* b) {
|
||||
if (a == NULL)
|
||||
return b;
|
||||
if (b == NULL)
|
||||
return a;
|
||||
|
||||
Info *ab = new Info();
|
||||
|
||||
ab->match_ = Prefilter::And(a->TakeMatch(), b->TakeMatch());
|
||||
ab->is_exact_ = false;
|
||||
delete a;
|
||||
delete b;
|
||||
return ab;
|
||||
}
|
||||
|
||||
// Constructs Info for a|b given a and b.
|
||||
Prefilter::Info* Prefilter::Info::Alt(Info* a, Info* b) {
|
||||
Info *ab = new Info();
|
||||
|
||||
if (a->is_exact_ && b->is_exact_) {
|
||||
CopyIn(a->exact_, &ab->exact_);
|
||||
CopyIn(b->exact_, &ab->exact_);
|
||||
ab->is_exact_ = true;
|
||||
} else {
|
||||
// Either a or b has is_exact_ = false. If the other
|
||||
// one has is_exact_ = true, we move it to match_ and
|
||||
// then create a OR of a,b. The resulting Info has
|
||||
// is_exact_ = false.
|
||||
ab->match_ = Prefilter::Or(a->TakeMatch(), b->TakeMatch());
|
||||
ab->is_exact_ = false;
|
||||
}
|
||||
|
||||
delete a;
|
||||
delete b;
|
||||
return ab;
|
||||
}
|
||||
|
||||
// Constructs Info for a? given a.
|
||||
Prefilter::Info* Prefilter::Info::Quest(Info *a) {
|
||||
Info *ab = new Info();
|
||||
|
||||
ab->is_exact_ = false;
|
||||
ab->match_ = new Prefilter(ALL);
|
||||
delete a;
|
||||
return ab;
|
||||
}
|
||||
|
||||
// Constructs Info for a* given a.
|
||||
// Same as a? -- not much to do.
|
||||
Prefilter::Info* Prefilter::Info::Star(Info *a) {
|
||||
return Quest(a);
|
||||
}
|
||||
|
||||
// Constructs Info for a+ given a. If a was exact set, it isn't
|
||||
// anymore.
|
||||
Prefilter::Info* Prefilter::Info::Plus(Info *a) {
|
||||
Info *ab = new Info();
|
||||
|
||||
ab->match_ = a->TakeMatch();
|
||||
ab->is_exact_ = false;
|
||||
|
||||
delete a;
|
||||
return ab;
|
||||
}
|
||||
|
||||
// Returns the bytes that runetochar produces for r, as a string.
static string RuneToString(Rune r) {
  char encoded[UTFmax];
  const int len = runetochar(encoded, &r);
  return string(encoded, len);
}
|
||||
|
||||
// Returns a one-byte string holding the low 8 bits of r.
static string RuneToStringLatin1(Rune r) {
  char byte = r & 0xff;
  return string(&byte, 1);
}
|
||||
|
||||
// Constructs Info for literal rune.
|
||||
Prefilter::Info* Prefilter::Info::Literal(Rune r) {
|
||||
Info* info = new Info();
|
||||
info->exact_.insert(RuneToString(ToLowerRune(r)));
|
||||
info->is_exact_ = true;
|
||||
return info;
|
||||
}
|
||||
|
||||
// Constructs Info for literal rune for Latin1 encoded string.
|
||||
Prefilter::Info* Prefilter::Info::LiteralLatin1(Rune r) {
|
||||
Info* info = new Info();
|
||||
info->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
|
||||
info->is_exact_ = true;
|
||||
return info;
|
||||
}
|
||||
|
||||
// Constructs Info for dot (any character).
|
||||
Prefilter::Info* Prefilter::Info::AnyChar() {
|
||||
Prefilter::Info* info = new Prefilter::Info();
|
||||
info->match_ = new Prefilter(ALL);
|
||||
return info;
|
||||
}
|
||||
|
||||
// Constructs Prefilter::Info for no possible match.
|
||||
Prefilter::Info* Prefilter::Info::NoMatch() {
|
||||
Prefilter::Info* info = new Prefilter::Info();
|
||||
info->match_ = new Prefilter(NONE);
|
||||
return info;
|
||||
}
|
||||
|
||||
// Constructs Prefilter::Info for any possible match.
|
||||
// This Prefilter::Info is valid for any regular expression,
|
||||
// since it makes no assertions whatsoever about the
|
||||
// strings being matched.
|
||||
Prefilter::Info* Prefilter::Info::AnyMatch() {
|
||||
Prefilter::Info *info = new Prefilter::Info();
|
||||
info->match_ = new Prefilter(ALL);
|
||||
return info;
|
||||
}
|
||||
|
||||
// Constructs Prefilter::Info for just the empty string.
|
||||
Prefilter::Info* Prefilter::Info::EmptyString() {
|
||||
Prefilter::Info* info = new Prefilter::Info();
|
||||
info->is_exact_ = true;
|
||||
info->exact_.insert("");
|
||||
return info;
|
||||
}
|
||||
|
||||
// Constructs Prefilter::Info for a character class.
|
||||
typedef CharClass::iterator CCIter;
|
||||
Prefilter::Info* Prefilter::Info::CClass(CharClass *cc,
|
||||
bool latin1) {
|
||||
if (Trace) {
|
||||
VLOG(0) << "CharClassInfo:";
|
||||
for (CCIter i = cc->begin(); i != cc->end(); ++i)
|
||||
VLOG(0) << " " << i->lo << "-" << i->hi;
|
||||
}
|
||||
|
||||
// If the class is too large, it's okay to overestimate.
|
||||
if (cc->size() > 10)
|
||||
return AnyChar();
|
||||
|
||||
Prefilter::Info *a = new Prefilter::Info();
|
||||
for (CCIter i = cc->begin(); i != cc->end(); ++i)
|
||||
for (Rune r = i->lo; r <= i->hi; r++) {
|
||||
if (latin1) {
|
||||
a->exact_.insert(RuneToStringLatin1(ToLowerRuneLatin1(r)));
|
||||
} else {
|
||||
a->exact_.insert(RuneToString(ToLowerRune(r)));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
a->is_exact_ = true;
|
||||
|
||||
if (Trace) {
|
||||
VLOG(0) << " = " << a->ToString();
|
||||
}
|
||||
|
||||
return a;
|
||||
}
|
||||
|
||||
// Regexp walker that produces a Prefilter::Info* for each visited node.
// The latin1 flag passed at construction selects the Latin-1 variants of
// the literal / character-class helpers in PostVisit.
class Prefilter::Info::Walker : public Regexp::Walker<Prefilter::Info*> {
 public:
  // explicit: a bare bool must never convert silently into a Walker.
  explicit Walker(bool latin1) : latin1_(latin1) {}

  // Computes the Info for re from the Infos of its children.
  virtual Info* PostVisit(
      Regexp* re, Info* parent_arg,
      Info* pre_arg,
      Info** child_args, int nchild_args);

  // Fallback result used instead of a full visit.
  virtual Info* ShortVisit(
      Regexp* re,
      Info* parent_arg);

  // True if the regexp being walked was parsed with the Latin1 flag.
  bool latin1() const { return latin1_; }

 private:
  bool latin1_;
  DISALLOW_COPY_AND_ASSIGN(Walker);
};
|
||||
|
||||
// Walks re and returns the Prefilter::Info computed for its root.
// The walk is given a budget of 100000; if the walker reports that it
// stopped early, the partial result is deleted and NULL is returned.
// The caller owns the returned Info.
Prefilter::Info* Prefilter::BuildInfo(Regexp* re) {
  if (Trace) {
    LOG(INFO) << "BuildPrefilter::Info: " << re->ToString();
  }

  const bool use_latin1 = (re->parse_flags() & Regexp::Latin1) != 0;
  Prefilter::Info::Walker walker(use_latin1);
  Prefilter::Info* result = walker.WalkExponential(re, NULL, 100000);

  if (!walker.stopped_early())
    return result;

  delete result;
  return NULL;
}
|
||||
|
||||
// Returns AnyMatch() regardless of re and parent_arg (both are unused).
// NOTE(review): presumably invoked by Regexp::Walker when it gives up on
// a full visit (e.g. the walk budget is exhausted) -- confirm in
// walker-inl.h.
Prefilter::Info* Prefilter::Info::Walker::ShortVisit(
|
||||
Regexp* re, Prefilter::Info* parent_arg) {
|
||||
return AnyMatch();
|
||||
}
|
||||
|
||||
// Constructs the Prefilter::Info for the given regular expression.
|
||||
// Assumes re is simplified.
|
||||
Prefilter::Info* Prefilter::Info::Walker::PostVisit(
|
||||
Regexp* re, Prefilter::Info* parent_arg,
|
||||
Prefilter::Info* pre_arg, Prefilter::Info** child_args,
|
||||
int nchild_args) {
|
||||
Prefilter::Info *info;
|
||||
switch (re->op()) {
|
||||
default:
|
||||
case kRegexpRepeat:
|
||||
LOG(DFATAL) << "Bad regexp op " << re->op();
|
||||
info = EmptyString();
|
||||
break;
|
||||
|
||||
case kRegexpNoMatch:
|
||||
info = NoMatch();
|
||||
break;
|
||||
|
||||
// These ops match the empty string:
|
||||
case kRegexpEmptyMatch: // anywhere
|
||||
case kRegexpBeginLine: // at beginning of line
|
||||
case kRegexpEndLine: // at end of line
|
||||
case kRegexpBeginText: // at beginning of text
|
||||
case kRegexpEndText: // at end of text
|
||||
case kRegexpWordBoundary: // at word boundary
|
||||
case kRegexpNoWordBoundary: // not at word boundary
|
||||
info = EmptyString();
|
||||
break;
|
||||
|
||||
case kRegexpLiteral:
|
||||
if (latin1()) {
|
||||
info = LiteralLatin1(re->rune());
|
||||
}
|
||||
else {
|
||||
info = Literal(re->rune());
|
||||
}
|
||||
break;
|
||||
|
||||
case kRegexpLiteralString:
|
||||
if (re->nrunes() == 0) {
|
||||
info = NoMatch();
|
||||
break;
|
||||
}
|
||||
if (latin1()) {
|
||||
info = LiteralLatin1(re->runes()[0]);
|
||||
for (int i = 1; i < re->nrunes(); i++) {
|
||||
info = Concat(info, LiteralLatin1(re->runes()[i]));
|
||||
}
|
||||
} else {
|
||||
info = Literal(re->runes()[0]);
|
||||
for (int i = 1; i < re->nrunes(); i++) {
|
||||
info = Concat(info, Literal(re->runes()[i]));
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case kRegexpConcat: {
|
||||
// Accumulate in info.
|
||||
// Exact is concat of recent contiguous exact nodes.
|
||||
info = NULL;
|
||||
Info* exact = NULL;
|
||||
for (int i = 0; i < nchild_args; i++) {
|
||||
Info* ci = child_args[i]; // child info
|
||||
if (!ci->is_exact() ||
|
||||
(exact && ci->exact().size() * exact->exact().size() > 16)) {
|
||||
// Exact run is over.
|
||||
info = And(info, exact);
|
||||
exact = NULL;
|
||||
// Add this child's info.
|
||||
info = And(info, ci);
|
||||
} else {
|
||||
// Append to exact run.
|
||||
exact = Concat(exact, ci);
|
||||
}
|
||||
}
|
||||
info = And(info, exact);
|
||||
}
|
||||
break;
|
||||
|
||||
case kRegexpAlternate:
|
||||
info = child_args[0];
|
||||
for (int i = 1; i < nchild_args; i++)
|
||||
info = Alt(info, child_args[i]);
|
||||
VLOG(10) << "Alt: " << info->ToString();
|
||||
break;
|
||||
|
||||
case kRegexpStar:
|
||||
info = Star(child_args[0]);
|
||||
break;
|
||||
|
||||
case kRegexpQuest:
|
||||
info = Quest(child_args[0]);
|
||||
break;
|
||||
|
||||
case kRegexpPlus:
|
||||
info = Plus(child_args[0]);
|
||||
break;
|
||||
|
||||
case kRegexpAnyChar:
|
||||
// Claim nothing, except that it's not empty.
|
||||
info = AnyChar();
|
||||
break;
|
||||
|
||||
case kRegexpCharClass:
|
||||
info = CClass(re->cc(), latin1());
|
||||
break;
|
||||
|
||||
case kRegexpCapture:
|
||||
// These don't affect the set of matching strings.
|
||||
info = child_args[0];
|
||||
break;
|
||||
}
|
||||
|
||||
if (Trace) {
|
||||
VLOG(0) << "BuildInfo " << re->ToString()
|
||||
<< ": " << (info ? info->ToString() : "");
|
||||
}
|
||||
|
||||
return info;
|
||||
}
|
||||
|
||||
|
||||
// Builds a Prefilter for re, or returns NULL if none can be formed
// (re is NULL, simplification yields nothing, or BuildInfo gives up).
// The caller owns the returned Prefilter.
Prefilter* Prefilter::FromRegexp(Regexp* re) {
  if (re == NULL)
    return NULL;

  Regexp* simple = re->Simplify();
  // Guard against a failed simplification before dereferencing the
  // result below (BuildInfo and Decref both use it unconditionally).
  if (simple == NULL)
    return NULL;

  Prefilter::Info* info = BuildInfo(simple);

  // The simplified regexp is only needed while building the Info.
  simple->Decref();
  if (info == NULL)
    return NULL;

  // Detach the match from the Info so deleting the Info leaves it alive.
  Prefilter* m = info->TakeMatch();

  delete info;
  return m;
}
|
||||
|
||||
// Returns a readable rendering of this prefilter:
//   NONE -> "*no-matches*", ATOM -> the atom, ALL -> "",
//   AND  -> children separated by spaces,
//   OR   -> "(" children separated by "|" ")".
// NULL children are rendered as "<nil>".
string Prefilter::DebugString() const {
  switch (op_) {
    case NONE:
      return "*no-matches*";

    case ATOM:
      return atom_;

    case ALL:
      return "";

    case AND:
    case OR: {
      // AND and OR differ only in separator and surrounding parentheses.
      const bool is_or = (op_ == OR);
      string out = is_or ? "(" : "";
      for (size_t i = 0; i < subs_->size(); i++) {
        if (i > 0)
          out += is_or ? "|" : " ";
        const Prefilter* child = (*subs_)[i];
        out += child ? child->DebugString() : "<nil>";
      }
      if (is_or)
        out += ")";
      return out;
    }

    default:
      LOG(DFATAL) << "Bad op in Prefilter::DebugString: " << op_;
      return StringPrintf("op%d", op_);
  }
}
|
||||
|
||||
// Builds a Prefilter from a compiled RE2. Returns NULL when re2 is NULL
// or exposes no Regexp; otherwise defers to FromRegexp.
Prefilter* Prefilter::FromRE2(const RE2* re2) {
  Regexp* parsed = (re2 != NULL) ? re2->Regexp() : NULL;
  return (parsed != NULL) ? FromRegexp(parsed) : NULL;
}
|
||||
|
||||
|
||||
} // namespace re2
|
105
third_party/re2/re2/prefilter.h
vendored
105
third_party/re2/re2/prefilter.h
vendored
@ -1,105 +0,0 @@
|
||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Prefilter is the class used to extract string guards from regexps.
|
||||
// Rather than using Prefilter class directly, use FilteredRE2.
|
||||
// See filtered_re2.h
|
||||
|
||||
#ifndef RE2_PREFILTER_H_
|
||||
#define RE2_PREFILTER_H_
|
||||
|
||||
#include "util/util.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
class RE2;
|
||||
|
||||
class Regexp;
|
||||
|
||||
class Prefilter {
|
||||
// Instead of using Prefilter directly, use FilteredRE2; see filtered_re2.h
|
||||
public:
|
||||
enum Op {
|
||||
ALL = 0, // Everything matches
|
||||
NONE, // Nothing matches
|
||||
ATOM, // The string atom() must match
|
||||
AND, // All in subs() must match
|
||||
OR, // One of subs() must match
|
||||
};
|
||||
|
||||
explicit Prefilter(Op op);
|
||||
~Prefilter();
|
||||
|
||||
Op op() { return op_; }
|
||||
const string& atom() const { return atom_; }
|
||||
void set_unique_id(int id) { unique_id_ = id; }
|
||||
int unique_id() const { return unique_id_; }
|
||||
|
||||
// The children of the Prefilter node.
|
||||
vector<Prefilter*>* subs() {
|
||||
CHECK(op_ == AND || op_ == OR);
|
||||
return subs_;
|
||||
}
|
||||
|
||||
// Set the children vector. Prefilter takes ownership of subs and
|
||||
// subs_ will be deleted when Prefilter is deleted.
|
||||
void set_subs(vector<Prefilter*>* subs) { subs_ = subs; }
|
||||
|
||||
// Given a RE2, return a Prefilter. The caller takes ownership of
|
||||
// the Prefilter and should deallocate it. Returns NULL if Prefilter
|
||||
// cannot be formed.
|
||||
static Prefilter* FromRE2(const RE2* re2);
|
||||
|
||||
// Returns a readable debug string of the prefilter.
|
||||
string DebugString() const;
|
||||
|
||||
private:
|
||||
class Info;
|
||||
|
||||
// Combines two prefilters together to create an AND. The passed
|
||||
// Prefilters will be part of the returned Prefilter or deleted.
|
||||
static Prefilter* And(Prefilter* a, Prefilter* b);
|
||||
|
||||
// Combines two prefilters together to create an OR. The passed
|
||||
// Prefilters will be part of the returned Prefilter or deleted.
|
||||
static Prefilter* Or(Prefilter* a, Prefilter* b);
|
||||
|
||||
// Generalized And/Or
|
||||
static Prefilter* AndOr(Op op, Prefilter* a, Prefilter* b);
|
||||
|
||||
static Prefilter* FromRegexp(Regexp* a);
|
||||
|
||||
static Prefilter* FromString(const string& str);
|
||||
|
||||
static Prefilter* OrStrings(set<string>* ss);
|
||||
|
||||
static Info* BuildInfo(Regexp* re);
|
||||
|
||||
Prefilter* Simplify();
|
||||
|
||||
// Kind of Prefilter.
|
||||
Op op_;
|
||||
|
||||
// Sub-matches for AND or OR Prefilter.
|
||||
vector<Prefilter*>* subs_;
|
||||
|
||||
// Actual string to match in leaf node.
|
||||
string atom_;
|
||||
|
||||
// If different prefilters have the same string atom, or if they are
|
||||
// structurally the same (e.g., OR of same atom strings) they are
|
||||
// considered the same unique nodes. This is the id for each unique
|
||||
// node. This field is populated with a unique id for every node,
|
||||
// and -1 for duplicate nodes.
|
||||
int unique_id_;
|
||||
|
||||
// Used for debugging, helps in tracking memory leaks.
|
||||
int alloc_id_;
|
||||
|
||||
DISALLOW_COPY_AND_ASSIGN(Prefilter);
|
||||
};
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_PREFILTER_H_
|
402
third_party/re2/re2/prefilter_tree.cc
vendored
402
third_party/re2/re2/prefilter_tree.cc
vendored
@ -1,402 +0,0 @@
|
||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/flags.h"
|
||||
#include "re2/prefilter.h"
|
||||
#include "re2/prefilter_tree.h"
|
||||
#include "re2/re2.h"
|
||||
|
||||
DEFINE_int32(filtered_re2_min_atom_len,
|
||||
3,
|
||||
"Strings less than this length are not stored as atoms");
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Constructs an empty tree; compiled_ starts out false, so Add() is
// allowed until Compile() flips it.
PrefilterTree::PrefilterTree()
|
||||
: compiled_(false) {
|
||||
}
|
||||
|
||||
// Releases everything the tree owns: each Prefilter handed to Add()
// and the parent map of each Entry.
PrefilterTree::~PrefilterTree() {
  for (vector<Prefilter*>::iterator it = prefilter_vec_.begin();
       it != prefilter_vec_.end(); ++it) {
    delete *it;
  }

  for (vector<Entry>::iterator it = entries_.begin();
       it != entries_.end(); ++it) {
    delete it->parents;
  }
}
|
||||
|
||||
// Functions used for adding and Compiling prefilters to the
|
||||
// PrefilterTree.
|
||||
// Decides whether prefilter is worth keeping, pruning it in place.
//   NULL / ALL : never kept.
//   ATOM       : kept if at least FLAGS_filtered_re2_min_atom_len long.
//   AND        : children that are not kept are deleted and removed;
//                kept if at least one child survives.
//   OR         : kept only if every child is kept (stops at the first
//                child that is not).
// level is the recursion depth; it is only passed along.
static bool KeepPart(Prefilter* prefilter, int level) {
  if (prefilter == NULL)
    return false;

  switch (prefilter->op()) {
    case Prefilter::ALL:
      return false;

    case Prefilter::ATOM: {
      const size_t min_len =
          static_cast<size_t>(FLAGS_filtered_re2_min_atom_len);
      return prefilter->atom().size() >= min_len;
    }

    case Prefilter::AND: {
      vector<Prefilter*>* children = prefilter->subs();
      int kept = 0;
      for (size_t i = 0; i < children->size(); i++) {
        Prefilter* child = (*children)[i];
        if (KeepPart(child, level + 1)) {
          // Compact survivors toward the front of the vector.
          (*children)[kept++] = child;
        } else {
          delete child;
        }
      }
      children->resize(kept);
      return kept > 0;
    }

    case Prefilter::OR: {
      vector<Prefilter*>* children = prefilter->subs();
      for (size_t i = 0; i < children->size(); i++) {
        if (!KeepPart((*children)[i], level + 1))
          return false;
      }
      return true;
    }

    default:
      LOG(DFATAL) << "Unexpected op in KeepPart: "
                  << prefilter->op();
      return false;
  }
}
|
||||
|
||||
// Registers the prefilter for the next regexp. The tree takes ownership.
// A prefilter that KeepPart rejects is deleted and recorded as NULL, so
// the slot index still corresponds to the regexp's position.
// Calling this after Compile() is an error (DFATAL) and has no effect.
void PrefilterTree::Add(Prefilter *f) {
  if (compiled_) {
    LOG(DFATAL) << "Add after Compile.";
    return;
  }

  Prefilter* kept = f;
  if (kept != NULL && !KeepPart(kept, 0)) {
    delete kept;
    kept = NULL;
  }

  prefilter_vec_.push_back(kept);
}
|
||||
|
||||
// Finalizes the tree: assigns unique ids (filling atom_vec with the atoms
// to search for) and then prunes triggers that fan out to too many
// parents. A second call is an error (DFATAL) and has no effect.
void PrefilterTree::Compile(vector<string>* atom_vec) {
  if (compiled_) {
    LOG(DFATAL) << "Compile after Compile.";
    return;
  }

  // We do this check to support some legacy uses of
  // PrefilterTree that call Compile before adding any regexps,
  // and expect Compile not to have effect.
  if (prefilter_vec_.empty())
    return;

  compiled_ = true;

  AssignUniqueIds(atom_vec);

  // Identify nodes that are too common among prefilters and are
  // triggering too many parents. Then get rid of them if possible.
  // Note that getting rid of a prefilter node simply means they are
  // no longer necessary for their parent to trigger; that is, we do
  // not miss out on any regexps triggering by getting rid of a
  // prefilter node.
  for (size_t i = 0; i < entries_.size(); i++) {
    StdIntMap* parents = entries_[i].parents;
    if (parents->size() <= 8)
      continue;

    // This one triggers too many things. If all the parents are AND
    // nodes and have other things guarding them, then get rid of
    // this trigger. TODO(vsri): Adjust the threshold appropriately,
    // make it a function of total number of nodes?
    bool every_parent_guarded = true;
    for (StdIntMap::iterator p = parents->begin();
         p != parents->end(); ++p) {
      if (entries_[p->first].propagate_up_at_count <= 1) {
        every_parent_guarded = false;
        break;
      }
    }
    if (!every_parent_guarded)
      continue;

    // Each parent now needs one fewer child to fire.
    for (StdIntMap::iterator p = parents->begin();
         p != parents->end(); ++p) {
      entries_[p->first].propagate_up_at_count -= 1;
    }
    parents->clear();  // Forget the parents
  }

  PrintDebugInfo();
}
|
||||
|
||||
// Looks up the node registered in node_map_ under node's NodeString.
// Returns that node, or NULL if no node with the same string has been
// registered yet.
Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
  const string key = NodeString(node);
  map<string, Prefilter*>::iterator found = node_map_.find(key);
  return (found != node_map_.end()) ? found->second : NULL;
}
|
||||
|
||||
// Formats n as a decimal string (with a leading '-' for negatives).
static string Itoa(int n) {
  // 100 bytes is far more than any int needs in decimal.
  char digits[100];
  snprintf(digits, sizeof digits, "%d", n);
  string text(digits);
  return text;
}
|
||||
|
||||
// Builds the key used to detect structurally identical nodes:
// "<op>:" followed by the atom text for ATOM nodes, or by the
// comma-separated unique ids of the children for every other op.
// Children must already have their unique ids assigned.
string PrefilterTree::NodeString(Prefilter* node) const {
  // Adding the operation disambiguates AND/OR/atom nodes.
  string key = Itoa(node->op()) + ":";

  if (node->op() == Prefilter::ATOM) {
    key += node->atom();
    return key;
  }

  for (size_t i = 0; i < node->subs()->size(); i++) {
    if (i > 0)
      key += ',';
    key += Itoa((*node->subs())[i]->unique_id());
  }
  return key;
}
|
||||
|
||||
// Assigns a unique id to every structurally distinct prefilter node,
// fills atom_vec with the distinct atoms, and builds entries_ (parent
// links, trigger counts and the regexps each top-level node fires).
void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
  atom_vec->clear();

  // Build vector of all filter nodes, sorted topologically
  // from top to bottom in nodes.
  vector<Prefilter*> nodes;

  // Add the top level nodes of each regexp prefilter.
  for (size_t i = 0; i < prefilter_vec_.size(); i++) {
    Prefilter* top = prefilter_vec_[i];
    if (top == NULL)
      unfiltered_.push_back(static_cast<int>(i));

    // We push NULL also on to nodes, so that we maintain the
    // mapping of index==regexpid for level=0 prefilter nodes.
    nodes.push_back(top);
  }

  // Now add all the descendant nodes. nodes grows while we scan it.
  for (size_t i = 0; i < nodes.size(); i++) {
    Prefilter* current = nodes[i];
    if (current == NULL)
      continue;
    if (current->op() != Prefilter::AND && current->op() != Prefilter::OR)
      continue;
    const vector<Prefilter*>& children = *current->subs();
    for (size_t j = 0; j < children.size(); j++)
      nodes.push_back(children[j]);
  }

  // Identify unique nodes, visiting children before their parents.
  int next_id = 0;
  for (int i = static_cast<int>(nodes.size()) - 1; i >= 0; i--) {
    Prefilter* node = nodes[i];
    if (node == NULL)
      continue;
    node->set_unique_id(-1);
    Prefilter* canonical = CanonicalNode(node);
    if (canonical != NULL) {
      // Duplicate of an already registered node: share its id.
      node->set_unique_id(canonical->unique_id());
      continue;
    }
    // Any further nodes that have the same node string
    // will find this node as the canonical node.
    node_map_[NodeString(node)] = node;
    if (node->op() == Prefilter::ATOM) {
      atom_vec->push_back(node->atom());
      atom_index_to_id_.push_back(next_id);
    }
    node->set_unique_id(next_id++);
  }
  entries_.resize(node_map_.size());

  // Create parent StdIntMap for the entries.
  for (int i = static_cast<int>(nodes.size()) - 1; i >= 0; i--) {
    Prefilter* node = nodes[i];
    if (node == NULL || CanonicalNode(node) != node)
      continue;

    entries_[node->unique_id()].parents = new StdIntMap();
  }

  // Fill the entries.
  for (int i = static_cast<int>(nodes.size()) - 1; i >= 0; i--) {
    Prefilter* node = nodes[i];
    if (node == NULL || CanonicalNode(node) != node)
      continue;

    Entry* entry = &entries_[node->unique_id()];

    switch (node->op()) {
      default:
      case Prefilter::ALL:
        LOG(DFATAL) << "Unexpected op: " << node->op();
        return;

      case Prefilter::ATOM:
        entry->propagate_up_at_count = 1;
        break;

      case Prefilter::OR:
      case Prefilter::AND: {
        set<int> distinct_children;
        for (size_t j = 0; j < node->subs()->size(); j++) {
          Prefilter* child = (*node->subs())[j];
          Prefilter* canonical = CanonicalNode(child);
          if (canonical == NULL) {
            LOG(DFATAL) << "Null canonical node";
            return;
          }
          const int child_id = canonical->unique_id();
          distinct_children.insert(child_id);
          // To the child, we want to add to parent indices.
          // insert() leaves an existing mapping untouched.
          entries_[child_id].parents->insert(
              StdIntMap::value_type(node->unique_id(), 1));
        }
        // An AND fires once all distinct children fired; an OR after one.
        if (node->op() == Prefilter::AND) {
          entry->propagate_up_at_count =
              static_cast<int>(distinct_children.size());
        } else {
          entry->propagate_up_at_count = 1;
        }
        break;
      }
    }
  }

  // For top level nodes, populate regexp id.
  for (size_t i = 0; i < prefilter_vec_.size(); i++) {
    Prefilter* top = prefilter_vec_[i];
    if (top == NULL)
      continue;
    int id = CanonicalNode(top)->unique_id();
    DCHECK_LE(0, id);
    entries_[id].regexps.push_back(static_cast<int>(i));
  }
}
|
||||
|
||||
// Functions for triggering during search.
|
||||
void PrefilterTree::RegexpsGivenStrings(
|
||||
const vector<int>& matched_atoms,
|
||||
vector<int>* regexps) const {
|
||||
regexps->clear();
|
||||
if (!compiled_) {
|
||||
LOG(WARNING) << "Compile() not called";
|
||||
for (size_t i = 0; i < prefilter_vec_.size(); ++i)
|
||||
regexps->push_back(static_cast<int>(i));
|
||||
} else {
|
||||
if (!prefilter_vec_.empty()) {
|
||||
IntMap regexps_map(static_cast<int>(prefilter_vec_.size()));
|
||||
vector<int> matched_atom_ids;
|
||||
for (size_t j = 0; j < matched_atoms.size(); j++) {
|
||||
matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
|
||||
VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]];
|
||||
}
|
||||
PropagateMatch(matched_atom_ids, ®exps_map);
|
||||
for (IntMap::iterator it = regexps_map.begin();
|
||||
it != regexps_map.end();
|
||||
++it)
|
||||
regexps->push_back(it->index());
|
||||
|
||||
regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
|
||||
}
|
||||
}
|
||||
sort(regexps->begin(), regexps->end());
|
||||
}
|
||||
|
||||
void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
|
||||
IntMap* regexps) const {
|
||||
IntMap count(static_cast<int>(entries_.size()));
|
||||
IntMap work(static_cast<int>(entries_.size()));
|
||||
for (size_t i = 0; i < atom_ids.size(); i++)
|
||||
work.set(atom_ids[i], 1);
|
||||
for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
|
||||
const Entry& entry = entries_[it->index()];
|
||||
VLOG(10) << "Processing: " << it->index();
|
||||
// Record regexps triggered.
|
||||
for (size_t i = 0; i < entry.regexps.size(); i++) {
|
||||
VLOG(10) << "Regexp triggered: " << entry.regexps[i];
|
||||
regexps->set(entry.regexps[i], 1);
|
||||
}
|
||||
int c;
|
||||
// Pass trigger up to parents.
|
||||
for (StdIntMap::iterator it = entry.parents->begin();
|
||||
it != entry.parents->end();
|
||||
++it) {
|
||||
int j = it->first;
|
||||
const Entry& parent = entries_[j];
|
||||
VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
|
||||
// Delay until all the children have succeeded.
|
||||
if (parent.propagate_up_at_count > 1) {
|
||||
if (count.has_index(j)) {
|
||||
c = count.get_existing(j) + 1;
|
||||
count.set_existing(j, c);
|
||||
} else {
|
||||
c = 1;
|
||||
count.set_new(j, c);
|
||||
}
|
||||
if (c < parent.propagate_up_at_count)
|
||||
continue;
|
||||
}
|
||||
VLOG(10) << "Triggering: " << j;
|
||||
// Trigger the parent.
|
||||
work.set(j, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Debugging help.
|
||||
void PrefilterTree::PrintPrefilter(int regexpid) {
|
||||
LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]);
|
||||
}
|
||||
|
||||
void PrefilterTree::PrintDebugInfo() {
|
||||
VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
|
||||
VLOG(10) << "#Unique Nodes: " << entries_.size();
|
||||
|
||||
for (size_t i = 0; i < entries_.size(); ++i) {
|
||||
StdIntMap* parents = entries_[i].parents;
|
||||
const vector<int>& regexps = entries_[i].regexps;
|
||||
VLOG(10) << "EntryId: " << i
|
||||
<< " N: " << parents->size() << " R: " << regexps.size();
|
||||
for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it)
|
||||
VLOG(10) << it->first;
|
||||
}
|
||||
VLOG(10) << "Map:";
|
||||
for (map<string, Prefilter*>::const_iterator iter = node_map_.begin();
|
||||
iter != node_map_.end(); ++iter)
|
||||
VLOG(10) << "NodeId: " << (*iter).second->unique_id()
|
||||
<< " Str: " << (*iter).first;
|
||||
}
|
||||
|
||||
// Recursively renders node for debugging. An ATOM is rendered as its
// text; any other node as "AND(...)" or "OR(...)" where each child
// appears as "<unique id>:<child rendering>", separated by commas.
string PrefilterTree::DebugNodeString(Prefilter* node) const {
  if (node->op() == Prefilter::ATOM) {
    DCHECK(!node->atom().empty());
    return node->atom();
  }

  // Adding the operation disambiguates AND and OR nodes.
  string text = node->op() == Prefilter::AND ? "AND" : "OR";
  text += "(";
  for (size_t i = 0; i < node->subs()->size(); i++) {
    Prefilter* child = (*node->subs())[i];
    if (i > 0)
      text += ',';
    text += Itoa(child->unique_id());
    text += ":";
    text += DebugNodeString(child);
  }
  text += ")";
  return text;
}
|
||||
|
||||
} // namespace re2
|
131
third_party/re2/re2/prefilter_tree.h
vendored
131
third_party/re2/re2/prefilter_tree.h
vendored
@ -1,131 +0,0 @@
|
||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// The PrefilterTree class is used to form an AND-OR tree of strings
|
||||
// that would trigger each regexp. The 'prefilter' of each regexp is
|
||||
// added to PrefilterTree, and then PrefilterTree is used to find all
|
||||
// the unique strings across the prefilters. During search, by using
|
||||
// matches from a string matching engine, PrefilterTree deduces the
|
||||
// set of regexps that are to be triggered. The 'string matching
|
||||
// engine' itself is outside of this class, and the caller can use any
|
||||
// favorite engine. PrefilterTree provides a set of strings (called
|
||||
// atoms) that the user of this class should use to do the string
|
||||
// matching.
|
||||
//
|
||||
#ifndef RE2_PREFILTER_TREE_H_
|
||||
#define RE2_PREFILTER_TREE_H_
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/sparse_array.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
typedef SparseArray<int> IntMap;
|
||||
typedef map<int, int> StdIntMap;
|
||||
|
||||
class Prefilter;
|
||||
|
||||
class PrefilterTree {
|
||||
public:
|
||||
PrefilterTree();
|
||||
~PrefilterTree();
|
||||
|
||||
// Adds the prefilter for the next regexp. Note that we assume that
|
||||
// Add called sequentially for all regexps. All Add calls
|
||||
// must precede Compile.
|
||||
void Add(Prefilter* prefilter);
|
||||
|
||||
// The Compile returns a vector of string in atom_vec.
|
||||
// Call this after all the prefilters are added through Add.
|
||||
// No calls to Add after Compile are allowed.
|
||||
// The caller should use the returned set of strings to do string matching.
|
||||
// Each time a string matches, the corresponding index then has to be
|
||||
// and passed to RegexpsGivenStrings below.
|
||||
void Compile(vector<string>* atom_vec);
|
||||
|
||||
// Given the indices of the atoms that matched, returns the indexes
|
||||
// of regexps that should be searched. The matched_atoms should
|
||||
// contain all the ids of string atoms that were found to match the
|
||||
// content. The caller can use any string match engine to perform
|
||||
// this function. This function is thread safe.
|
||||
void RegexpsGivenStrings(const vector<int>& matched_atoms,
|
||||
vector<int>* regexps) const;
|
||||
|
||||
// Print debug prefilter. Also prints unique ids associated with
|
||||
// nodes of the prefilter of the regexp.
|
||||
void PrintPrefilter(int regexpid);
|
||||
|
||||
|
||||
// Each unique node has a corresponding Entry that helps in
|
||||
// passing the matching trigger information along the tree.
|
||||
struct Entry {
|
||||
public:
|
||||
// How many children should match before this node triggers the
|
||||
// parent. For an atom and an OR node, this is 1 and for an AND
|
||||
// node, it is the number of unique children.
|
||||
int propagate_up_at_count;
|
||||
|
||||
// When this node is ready to trigger the parent, what are the indices
|
||||
// of the parent nodes to trigger. The reason there may be more than
|
||||
// one is because of sharing. For example (abc | def) and (xyz | def)
|
||||
// are two different nodes, but they share the atom 'def'. So when
|
||||
// 'def' matches, it triggers two parents, corresponding to the two
|
||||
// different OR nodes.
|
||||
StdIntMap* parents;
|
||||
|
||||
// When this node is ready to trigger the parent, what are the
|
||||
// regexps that are triggered.
|
||||
vector<int> regexps;
|
||||
};
|
||||
|
||||
private:
|
||||
// This function assigns unique ids to various parts of the
|
||||
// prefilter, by looking at if these nodes are already in the
|
||||
// PrefilterTree.
|
||||
void AssignUniqueIds(vector<string>* atom_vec);
|
||||
|
||||
// Given the matching atoms, find the regexps to be triggered.
|
||||
void PropagateMatch(const vector<int>& atom_ids,
|
||||
IntMap* regexps) const;
|
||||
|
||||
// Returns the prefilter node that has the same NodeString as this
|
||||
// node. For the canonical node, returns node.
|
||||
Prefilter* CanonicalNode(Prefilter* node);
|
||||
|
||||
// A string that uniquely identifies the node. Assumes that the
|
||||
// children of node has already been assigned unique ids.
|
||||
string NodeString(Prefilter* node) const;
|
||||
|
||||
// Recursively constructs a readable prefilter string.
|
||||
string DebugNodeString(Prefilter* node) const;
|
||||
|
||||
// Used for debugging.
|
||||
void PrintDebugInfo();
|
||||
|
||||
// These are all the nodes formed by Compile. Essentially, there is
|
||||
// one node for each unique atom and each unique AND/OR node.
|
||||
vector<Entry> entries_;
|
||||
|
||||
// Map node string to canonical Prefilter node.
|
||||
map<string, Prefilter*> node_map_;
|
||||
|
||||
// indices of regexps that always pass through the filter (since we
|
||||
// found no required literals in these regexps).
|
||||
vector<int> unfiltered_;
|
||||
|
||||
// vector of Prefilter for all regexps.
|
||||
vector<Prefilter*> prefilter_vec_;
|
||||
|
||||
// Atom index in returned strings to entry id mapping.
|
||||
vector<int> atom_index_to_id_;
|
||||
|
||||
// Has the prefilter tree been compiled.
|
||||
bool compiled_;
|
||||
|
||||
DISALLOW_COPY_AND_ASSIGN(PrefilterTree);
|
||||
};
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif // RE2_PREFILTER_TREE_H_
|
343
third_party/re2/re2/prog.cc
vendored
343
third_party/re2/re2/prog.cc
vendored
@ -1,343 +0,0 @@
|
||||
// Copyright 2007 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Compiled regular expression representation.
|
||||
// Tested by compile_test.cc
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/sparse_set.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/stringpiece.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Constructors per Inst opcode
|
||||
|
||||
// Turns this instruction into a kInstAlt with successor `out` and second
// successor `out1`. The DCHECK requires out_opcode_ to still be 0
// (presumably: the instruction has not been initialized before).
void Prog::Inst::InitAlt(uint32 out, uint32 out1) {
|
||||
DCHECK_EQ(out_opcode_, 0);
|
||||
set_out_opcode(out, kInstAlt);
|
||||
out1_ = out1;
|
||||
}
|
||||
|
||||
// Turns this instruction into a kInstByteRange with successor `out`.
// lo, hi and foldcase are each truncated to their low 8 bits before
// being stored. The DCHECK requires out_opcode_ to still be 0.
void Prog::Inst::InitByteRange(int lo, int hi, int foldcase, uint32 out) {
|
||||
DCHECK_EQ(out_opcode_, 0);
|
||||
set_out_opcode(out, kInstByteRange);
|
||||
lo_ = lo & 0xFF;
|
||||
hi_ = hi & 0xFF;
|
||||
foldcase_ = foldcase & 0xFF;
|
||||
}
|
||||
|
||||
// Turns this instruction into a kInstCapture that records capture
// index `cap` and continues at `out`. The DCHECK requires out_opcode_
// to still be 0.
void Prog::Inst::InitCapture(int cap, uint32 out) {
|
||||
DCHECK_EQ(out_opcode_, 0);
|
||||
set_out_opcode(out, kInstCapture);
|
||||
cap_ = cap;
|
||||
}
|
||||
|
||||
// Turns this instruction into a kInstEmptyWidth that stores the
// empty-width condition `empty` and continues at `out`. The DCHECK
// requires out_opcode_ to still be 0.
void Prog::Inst::InitEmptyWidth(EmptyOp empty, uint32 out) {
|
||||
DCHECK_EQ(out_opcode_, 0);
|
||||
set_out_opcode(out, kInstEmptyWidth);
|
||||
empty_ = empty;
|
||||
}
|
||||
|
||||
// Turns this instruction into a kInstMatch carrying match id `id`.
// Only the opcode is set; no successor is recorded. The DCHECK requires
// out_opcode_ to still be 0.
void Prog::Inst::InitMatch(int32 id) {
|
||||
DCHECK_EQ(out_opcode_, 0);
|
||||
set_opcode(kInstMatch);
|
||||
match_id_ = id;
|
||||
}
|
||||
|
||||
// Turns this instruction into a kInstNop. Note that `out` is NOT stored
// here: only the opcode is set.
// NOTE(review): presumably the successor is patched in later by the
// compiler -- confirm against the callers in compile.cc.
void Prog::Inst::InitNop(uint32 out) {
|
||||
DCHECK_EQ(out_opcode_, 0);
|
||||
set_opcode(kInstNop);
|
||||
}
|
||||
|
||||
// Turns this instruction into a kInstFail. The DCHECK requires
// out_opcode_ to still be 0.
void Prog::Inst::InitFail() {
|
||||
DCHECK_EQ(out_opcode_, 0);
|
||||
set_opcode(kInstFail);
|
||||
}
|
||||
|
||||
// Returns a one-line, human-readable description of this instruction:
// its kind, any operands, and the id(s) of its successor(s).
// Opcodes not listed below are rendered as "opcode <n>".
string Prog::Inst::Dump() {
  switch (opcode()) {
    case kInstAlt:
      return StringPrintf("alt -> %d | %d", out(), out1_);

    case kInstAltMatch:
      return StringPrintf("altmatch -> %d | %d", out(), out1_);

    case kInstByteRange: {
      const char* fold_suffix = foldcase_ ? "/i" : "";
      return StringPrintf("byte%s [%02x-%02x] -> %d",
                          fold_suffix, lo_, hi_, out());
    }

    case kInstCapture:
      return StringPrintf("capture %d -> %d", cap_, out());

    case kInstEmptyWidth:
      return StringPrintf("emptywidth %#x -> %d",
                          static_cast<int>(empty_), out());

    case kInstMatch:
      return StringPrintf("match! %d", match_id());

    case kInstNop:
      return StringPrintf("nop -> %d", out());

    case kInstFail:
      return StringPrintf("fail");

    default:
      return StringPrintf("opcode %d", static_cast<int>(opcode()));
  }
}
|
||||
|
||||
// Constructs an empty program: every flag is false, every counter and
// size is 0, and every pointer member (instructions, DFAs, DFA deleter,
// unbytemap, one-pass data) is NULL.
Prog::Prog()
|
||||
: anchor_start_(false),
|
||||
anchor_end_(false),
|
||||
reversed_(false),
|
||||
did_onepass_(false),
|
||||
start_(0),
|
||||
start_unanchored_(0),
|
||||
size_(0),
|
||||
byte_inst_count_(0),
|
||||
bytemap_range_(0),
|
||||
flags_(0),
|
||||
onepass_statesize_(0),
|
||||
inst_(NULL),
|
||||
dfa_first_(NULL),
|
||||
dfa_longest_(NULL),
|
||||
dfa_mem_(0),
|
||||
delete_dfa_(NULL),
|
||||
unbytemap_(NULL),
|
||||
onepass_nodes_(NULL),
|
||||
onepass_start_(NULL) {
|
||||
}
|
||||
|
||||
// Destroys the program. Any DFA that was built is released through the
// delete_dfa_ callback (when one is set); the one-pass nodes, the
// instruction array and the unbytemap array are freed with delete[].
Prog::~Prog() {
  if (delete_dfa_ != NULL) {
    if (dfa_first_ != NULL)
      delete_dfa_(dfa_first_);
    if (dfa_longest_ != NULL)
      delete_dfa_(dfa_longest_);
  }

  delete[] onepass_nodes_;
  delete[] inst_;
  delete[] unbytemap_;
}
|
||||
|
||||
typedef SparseSet Workq;
|
||||
|
||||
// Inserts instruction id `id` into the work queue `q`.
// Id 0 is never queued; it is skipped silently.
static inline void AddToQueue(Workq* q, int id) {
  if (id == 0)
    return;
  q->insert(id);
}
|
||||
|
||||
// Renders every instruction reachable from the ids already in `q`,
// one "<id>. <dump>\n" line per instruction, in queue order.
//
// The queue doubles as the visited set: while an instruction is printed its
// successors are appended to `q`, so the traversal keeps going until no new
// ids appear.  q->end() is therefore re-evaluated on every iteration, and
// the loop depends on Workq iterators staying usable across insert(), just
// as the loop did before this rewrite.
static string ProgToString(Prog* prog, Workq* q) {
  string text;

  Workq::iterator it = q->begin();
  while (it != q->end()) {
    const int id = *it;
    Prog::Inst* inst = prog->inst(id);
    StringAppendF(&text, "%d. %s\n", id, inst->Dump().c_str());

    AddToQueue(q, inst->out());
    switch (inst->opcode()) {
      case kInstAlt:
      case kInstAltMatch:
        // Alternations have a second successor.
        AddToQueue(q, inst->out1());
        break;
      default:
        break;
    }
    ++it;
  }
  return text;
}
|
||||
|
||||
// Returns a textual listing of the instructions reachable from start_.
// A byte-map listing can be prepended by enabling the debugging block
// below; it is compiled in but disabled.
string Prog::Dump() {
  string text;
  if (false) {  // Debugging
    StringAppendF(&text, "byte map:\n");
    int lo = 0;
    for (int i = 0; i < bytemap_range_; i++) {
      // Class i covers the bytes from `lo` through unbytemap_[i].
      StringAppendF(&text, "\t%d. [%02x-%02x]\n", i, lo, unbytemap_[i]);
      lo = unbytemap_[i] + 1;
    }
    StringAppendF(&text, "\n");
  }

  Workq reachable(size_);
  AddToQueue(&reachable, start_);
  text += ProgToString(this, &reachable);
  return text;
}
|
||||
|
||||
// Same listing as Dump(), but the traversal begins at the unanchored
// entry point start_unanchored_ and no byte-map section is ever emitted.
string Prog::DumpUnanchored() {
  Workq reachable(size_);
  AddToQueue(&reachable, start_unanchored_);
  return ProgToString(this, &reachable);
}
|
||||
|
||||
static bool IsMatch(Prog*, Prog::Inst*);
|
||||
|
||||
// Peep-hole optimizer.
|
||||
void Prog::Optimize() {
|
||||
Workq q(size_);
|
||||
|
||||
// Eliminate nops. Most are taken out during compilation
|
||||
// but a few are hard to avoid.
|
||||
q.clear();
|
||||
AddToQueue(&q, start_);
|
||||
for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
|
||||
int id = *i;
|
||||
|
||||
Inst* ip = inst(id);
|
||||
int j = ip->out();
|
||||
Inst* jp;
|
||||
while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
|
||||
j = jp->out();
|
||||
}
|
||||
ip->set_out(j);
|
||||
AddToQueue(&q, ip->out());
|
||||
|
||||
if (ip->opcode() == kInstAlt) {
|
||||
j = ip->out1();
|
||||
while (j != 0 && (jp=inst(j))->opcode() == kInstNop) {
|
||||
j = jp->out();
|
||||
}
|
||||
ip->out1_ = j;
|
||||
AddToQueue(&q, ip->out1());
|
||||
}
|
||||
}
|
||||
|
||||
// Insert kInstAltMatch instructions
|
||||
// Look for
|
||||
// ip: Alt -> j | k
|
||||
// j: ByteRange [00-FF] -> ip
|
||||
// k: Match
|
||||
// or the reverse (the above is the greedy one).
|
||||
// Rewrite Alt to AltMatch.
|
||||
q.clear();
|
||||
AddToQueue(&q, start_);
|
||||
for (Workq::iterator i = q.begin(); i != q.end(); ++i) {
|
||||
int id = *i;
|
||||
Inst* ip = inst(id);
|
||||
AddToQueue(&q, ip->out());
|
||||
if (ip->opcode() == kInstAlt)
|
||||
AddToQueue(&q, ip->out1());
|
||||
|
||||
if (ip->opcode() == kInstAlt) {
|
||||
Inst* j = inst(ip->out());
|
||||
Inst* k = inst(ip->out1());
|
||||
if (j->opcode() == kInstByteRange && j->out() == id &&
|
||||
j->lo() == 0x00 && j->hi() == 0xFF &&
|
||||
IsMatch(this, k)) {
|
||||
ip->set_opcode(kInstAltMatch);
|
||||
continue;
|
||||
}
|
||||
if (IsMatch(this, j) &&
|
||||
k->opcode() == kInstByteRange && k->out() == id &&
|
||||
k->lo() == 0x00 && k->hi() == 0xFF) {
|
||||
ip->set_opcode(kInstAltMatch);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Is ip a guaranteed match at end of text, perhaps after some capturing?
|
||||
static bool IsMatch(Prog* prog, Prog::Inst* ip) {
|
||||
for (;;) {
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
LOG(DFATAL) << "Unexpected opcode in IsMatch: " << ip->opcode();
|
||||
return false;
|
||||
|
||||
case kInstAlt:
|
||||
case kInstAltMatch:
|
||||
case kInstByteRange:
|
||||
case kInstFail:
|
||||
case kInstEmptyWidth:
|
||||
return false;
|
||||
|
||||
case kInstCapture:
|
||||
case kInstNop:
|
||||
ip = prog->inst(ip->out());
|
||||
break;
|
||||
|
||||
case kInstMatch:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the bitwise OR of the kEmpty* flags that hold at position `p`
// inside `text`.
//
// Exactly one of kEmptyWordBoundary / kEmptyNonWordBoundary is always set.
// p[-1] is read only when p is not text.begin(); p[0] is read only when p is
// not text.end().
uint32 Prog::EmptyFlags(const StringPiece& text, const char* p) {
  const bool at_begin = (p == text.begin());
  const bool at_end = (p == text.end());
  int flags = 0;

  // ^ and \A
  if (at_begin)
    flags |= kEmptyBeginText | kEmptyBeginLine;
  else if (p[-1] == '\n')
    flags |= kEmptyBeginLine;

  // $ and \z
  if (at_end)
    flags |= kEmptyEndText | kEmptyEndLine;
  else if (p < text.end() && p[0] == '\n')
    flags |= kEmptyEndLine;

  // \b and \B: a boundary exists when exactly one neighbour is a word
  // character; the edge of the text counts as a non-word neighbour.
  const bool word_before = !at_begin && IsWordChar(p[-1]);
  const bool word_after = !at_end && IsWordChar(p[0]);
  if (word_before != word_after)
    flags |= kEmptyWordBoundary;
  else
    flags |= kEmptyNonWordBoundary;

  return flags;
}
|
||||
|
||||
void Prog::MarkByteRange(int lo, int hi) {
|
||||
DCHECK_GE(lo, 0);
|
||||
DCHECK_GE(hi, 0);
|
||||
DCHECK_LE(lo, 255);
|
||||
DCHECK_LE(hi, 255);
|
||||
DCHECK_LE(lo, hi);
|
||||
if (0 < lo && lo <= 255)
|
||||
byterange_.Set(lo - 1);
|
||||
if (0 <= hi && hi <= 255)
|
||||
byterange_.Set(hi);
|
||||
}
|
||||
|
||||
void Prog::ComputeByteMap() {
|
||||
// Fill in bytemap with byte classes for prog_.
|
||||
// Ranges of bytes that are treated as indistinguishable
|
||||
// by the regexp program are mapped to a single byte class.
|
||||
// The vector prog_->byterange() marks the end of each
|
||||
// such range.
|
||||
const Bitmap<256>& v = byterange();
|
||||
|
||||
COMPILE_ASSERT(8*sizeof(v.Word(0)) == 32, wordsize);
|
||||
uint8 n = 0;
|
||||
uint32 bits = 0;
|
||||
for (int i = 0; i < 256; i++) {
|
||||
if ((i&31) == 0)
|
||||
bits = v.Word(i >> 5);
|
||||
bytemap_[i] = n;
|
||||
n += bits & 1;
|
||||
bits >>= 1;
|
||||
}
|
||||
bytemap_range_ = bytemap_[255] + 1;
|
||||
unbytemap_ = new uint8[bytemap_range_];
|
||||
for (int i = 0; i < 256; i++)
|
||||
unbytemap_[bytemap_[i]] = static_cast<uint8>(i);
|
||||
|
||||
if (0) { // For debugging: use trivial byte map.
|
||||
for (int i = 0; i < 256; i++) {
|
||||
bytemap_[i] = static_cast<uint8>(i);
|
||||
unbytemap_[i] = static_cast<uint8>(i);
|
||||
}
|
||||
bytemap_range_ = 256;
|
||||
LOG(INFO) << "Using trivial bytemap.";
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
|
381
third_party/re2/re2/prog.h
vendored
381
third_party/re2/re2/prog.h
vendored
@ -1,381 +0,0 @@
|
||||
// Copyright 2007 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Compiled representation of regular expressions.
|
||||
// See regexp.h for the Regexp class, which represents a regular
|
||||
// expression symbolically.
|
||||
|
||||
#ifndef RE2_PROG_H__
|
||||
#define RE2_PROG_H__
|
||||
|
||||
#include "util/util.h"
|
||||
#include "util/sparse_array.h"
|
||||
#include "re2/re2.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Simple fixed-size bitmap.
|
||||
template<int Bits>
|
||||
class Bitmap {
|
||||
public:
|
||||
Bitmap() { Reset(); }
|
||||
int Size() { return Bits; }
|
||||
|
||||
void Reset() {
|
||||
for (int i = 0; i < Words; i++)
|
||||
w_[i] = 0;
|
||||
}
|
||||
bool Get(int k) const {
|
||||
return w_[k >> WordLog] & (1<<(k & 31));
|
||||
}
|
||||
void Set(int k) {
|
||||
w_[k >> WordLog] |= 1<<(k & 31);
|
||||
}
|
||||
void Clear(int k) {
|
||||
w_[k >> WordLog] &= ~(1<<(k & 31));
|
||||
}
|
||||
uint32 Word(int i) const {
|
||||
return w_[i];
|
||||
}
|
||||
|
||||
private:
|
||||
static const int WordLog = 5;
|
||||
static const int Words = (Bits+31)/32;
|
||||
uint32 w_[Words];
|
||||
DISALLOW_COPY_AND_ASSIGN(Bitmap);
|
||||
};
|
||||
|
||||
|
||||
// Opcodes for Inst.
// The numeric values are spelled out because an opcode is stored in the low
// three bits of Inst::out_opcode_; all eight values 0..7 are in use.
enum InstOp {
  kInstAlt = 0,         // choose between out_ and out1_
  kInstAltMatch = 1,    // Alt: out_ is [00-FF] and back, out1_ is match; or vice versa.
  kInstByteRange = 2,   // next (possible case-folded) byte must be in [lo_, hi_]
  kInstCapture = 3,     // capturing parenthesis number cap_
  kInstEmptyWidth = 4,  // empty-width special (^ $ ...); bit(s) set in empty_
  kInstMatch = 5,       // found a match!
  kInstNop = 6,         // no-op; occasionally unavoidable
  kInstFail = 7         // never match; occasionally unavoidable
};
|
||||
|
||||
// Bit flags for empty-width specials.  Each flag occupies its own bit so
// that several can be OR-ed together; kEmptyAllFlags is the union of all six.
enum EmptyOp {
  kEmptyBeginLine       = 0x01,  // ^ - beginning of line
  kEmptyEndLine         = 0x02,  // $ - end of line
  kEmptyBeginText       = 0x04,  // \A - beginning of text
  kEmptyEndText         = 0x08,  // \z - end of text
  kEmptyWordBoundary    = 0x10,  // \b - word boundary
  kEmptyNonWordBoundary = 0x20,  // \B - not \b
  kEmptyAllFlags        = 0x3F   // every flag above
};
|
||||
|
||||
class Regexp;
|
||||
|
||||
class DFA;
|
||||
struct OneState;
|
||||
|
||||
// Compiled form of regexp program.
|
||||
class Prog {
|
||||
public:
|
||||
Prog();
|
||||
~Prog();
|
||||
|
||||
// Single instruction in regexp program.
|
||||
class Inst {
|
||||
public:
|
||||
Inst() : out_opcode_(0), out1_(0) { }
|
||||
|
||||
// Constructors per opcode
|
||||
void InitAlt(uint32 out, uint32 out1);
|
||||
void InitByteRange(int lo, int hi, int foldcase, uint32 out);
|
||||
void InitCapture(int cap, uint32 out);
|
||||
void InitEmptyWidth(EmptyOp empty, uint32 out);
|
||||
void InitMatch(int id);
|
||||
void InitNop(uint32 out);
|
||||
void InitFail();
|
||||
|
||||
// Getters
|
||||
int id(Prog* p) { return static_cast<int>(this - p->inst_); }
|
||||
InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
|
||||
int out() { return out_opcode_>>3; }
|
||||
int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); return out1_; }
|
||||
int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
|
||||
int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
|
||||
int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
|
||||
int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
|
||||
int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
|
||||
EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
|
||||
bool greedy(Prog *p) {
|
||||
DCHECK_EQ(opcode(), kInstAltMatch);
|
||||
return p->inst(out())->opcode() == kInstByteRange;
|
||||
}
|
||||
|
||||
// Does this inst (an kInstByteRange) match c?
|
||||
inline bool Matches(int c) {
|
||||
DCHECK_EQ(opcode(), kInstByteRange);
|
||||
if (foldcase_ && 'A' <= c && c <= 'Z')
|
||||
c += 'a' - 'A';
|
||||
return lo_ <= c && c <= hi_;
|
||||
}
|
||||
|
||||
// Returns string representation for debugging.
|
||||
string Dump();
|
||||
|
||||
// Maximum instruction id.
|
||||
// (Must fit in out_opcode_, and PatchList steals another bit.)
|
||||
static const int kMaxInst = (1<<28) - 1;
|
||||
|
||||
private:
|
||||
void set_opcode(InstOp opcode) {
|
||||
out_opcode_ = (out()<<3) | opcode;
|
||||
}
|
||||
|
||||
void set_out(int out) {
|
||||
out_opcode_ = (out<<3) | opcode();
|
||||
}
|
||||
|
||||
void set_out_opcode(int out, InstOp opcode) {
|
||||
out_opcode_ = (out<<3) | opcode;
|
||||
}
|
||||
|
||||
uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode
|
||||
union { // additional instruction arguments:
|
||||
uint32 out1_; // opcode == kInstAlt
|
||||
// alternate next instruction
|
||||
|
||||
int32 cap_; // opcode == kInstCapture
|
||||
// Index of capture register (holds text
|
||||
// position recorded by capturing parentheses).
|
||||
// For \n (the submatch for the nth parentheses),
|
||||
// the left parenthesis captures into register 2*n
|
||||
// and the right one captures into register 2*n+1.
|
||||
|
||||
int32 match_id_; // opcode == kInstMatch
|
||||
// Match ID to identify this match (for re2::Set).
|
||||
|
||||
struct { // opcode == kInstByteRange
|
||||
uint8 lo_; // byte range is lo_-hi_ inclusive
|
||||
uint8 hi_; //
|
||||
uint8 foldcase_; // convert A-Z to a-z before checking range.
|
||||
};
|
||||
|
||||
EmptyOp empty_; // opcode == kInstEmptyWidth
|
||||
// empty_ is bitwise OR of kEmpty* flags above.
|
||||
};
|
||||
|
||||
friend class Compiler;
|
||||
friend struct PatchList;
|
||||
friend class Prog;
|
||||
|
||||
DISALLOW_COPY_AND_ASSIGN(Inst);
|
||||
};
|
||||
|
||||
// Whether to anchor the search.
|
||||
enum Anchor {
|
||||
kUnanchored, // match anywhere
|
||||
kAnchored, // match only starting at beginning of text
|
||||
};
|
||||
|
||||
// Kind of match to look for (for anchor != kFullMatch)
|
||||
//
|
||||
// kLongestMatch mode finds the overall longest
|
||||
// match but still makes its submatch choices the way
|
||||
// Perl would, not in the way prescribed by POSIX.
|
||||
// The POSIX rules are much more expensive to implement,
|
||||
// and no one has needed them.
|
||||
//
|
||||
// kFullMatch is not strictly necessary -- we could use
|
||||
// kLongestMatch and then check the length of the match -- but
|
||||
// the matching code can run faster if it knows to consider only
|
||||
// full matches.
|
||||
enum MatchKind {
|
||||
kFirstMatch, // like Perl, PCRE
|
||||
kLongestMatch, // like egrep or POSIX
|
||||
kFullMatch, // match only entire text; implies anchor==kAnchored
|
||||
kManyMatch // for SearchDFA, records set of matches
|
||||
};
|
||||
|
||||
Inst *inst(int id) { return &inst_[id]; }
|
||||
int start() { return start_; }
|
||||
int start_unanchored() { return start_unanchored_; }
|
||||
void set_start(int start) { start_ = start; }
|
||||
void set_start_unanchored(int start) { start_unanchored_ = start; }
|
||||
int size() { return size_; }
|
||||
bool reversed() { return reversed_; }
|
||||
void set_reversed(bool reversed) { reversed_ = reversed; }
|
||||
int byte_inst_count() { return byte_inst_count_; }
|
||||
const Bitmap<256>& byterange() { return byterange_; }
|
||||
void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
|
||||
int64 dfa_mem() { return dfa_mem_; }
|
||||
int flags() { return flags_; }
|
||||
void set_flags(int flags) { flags_ = flags; }
|
||||
bool anchor_start() { return anchor_start_; }
|
||||
void set_anchor_start(bool b) { anchor_start_ = b; }
|
||||
bool anchor_end() { return anchor_end_; }
|
||||
void set_anchor_end(bool b) { anchor_end_ = b; }
|
||||
int bytemap_range() { return bytemap_range_; }
|
||||
const uint8* bytemap() { return bytemap_; }
|
||||
|
||||
// Returns string representation of program for debugging.
|
||||
string Dump();
|
||||
string DumpUnanchored();
|
||||
|
||||
// Record that at some point in the prog, the bytes in the range
|
||||
// lo-hi (inclusive) are treated as different from bytes outside the range.
|
||||
// Tracking this lets the DFA collapse commonly-treated byte ranges
|
||||
// when recording state pointers, greatly reducing its memory footprint.
|
||||
void MarkByteRange(int lo, int hi);
|
||||
|
||||
// Returns the set of kEmpty flags that are in effect at
|
||||
// position p within context.
|
||||
static uint32 EmptyFlags(const StringPiece& context, const char* p);
|
||||
|
||||
// Returns whether byte c is a word character: ASCII only.
|
||||
// Used by the implementation of \b and \B.
|
||||
// This is not right for Unicode, but:
|
||||
// - it's hard to get right in a byte-at-a-time matching world
|
||||
// (the DFA has only one-byte lookahead).
|
||||
// - even if the lookahead were possible, the Progs would be huge.
|
||||
// This crude approximation is the same one PCRE uses.
|
||||
static bool IsWordChar(uint8 c) {
|
||||
return ('A' <= c && c <= 'Z') ||
|
||||
('a' <= c && c <= 'z') ||
|
||||
('0' <= c && c <= '9') ||
|
||||
c == '_';
|
||||
}
|
||||
|
||||
// Execution engines. They all search for the regexp (run the prog)
|
||||
// in text, which is in the larger context (used for ^ $ \b etc).
|
||||
// Anchor and kind control the kind of search.
|
||||
// Returns true if match found, false if not.
|
||||
// If match found, fills match[0..nmatch-1] with submatch info.
|
||||
// match[0] is overall match, match[1] is first set of parens, etc.
|
||||
// If a particular submatch is not matched during the regexp match,
|
||||
// it is set to NULL.
|
||||
//
|
||||
// Matching text == StringPiece(NULL, 0) is treated as any other empty
|
||||
// string, but note that on return, it will not be possible to distinguish
|
||||
// submatches that matched that empty string from submatches that didn't
|
||||
// match anything. Either way, match[i] == NULL.
|
||||
|
||||
// Search using NFA: can find submatches but kind of slow.
|
||||
bool SearchNFA(const StringPiece& text, const StringPiece& context,
|
||||
Anchor anchor, MatchKind kind,
|
||||
StringPiece* match, int nmatch);
|
||||
|
||||
// Search using DFA: much faster than NFA but only finds
|
||||
// end of match and can use a lot more memory.
|
||||
// Returns whether a match was found.
|
||||
// If the DFA runs out of memory, sets *failed to true and returns false.
|
||||
// If matches != NULL and kind == kManyMatch and there is a match,
|
||||
// SearchDFA fills matches with the match IDs of the final matching state.
|
||||
bool SearchDFA(const StringPiece& text, const StringPiece& context,
|
||||
Anchor anchor, MatchKind kind,
|
||||
StringPiece* match0, bool* failed,
|
||||
vector<int>* matches);
|
||||
|
||||
// Build the entire DFA for the given match kind. FOR TESTING ONLY.
|
||||
// Usually the DFA is built out incrementally, as needed, which
|
||||
// avoids lots of unnecessary work. This function is useful only
|
||||
// for testing purposes. Returns number of states.
|
||||
int BuildEntireDFA(MatchKind kind);
|
||||
|
||||
// Compute byte map.
|
||||
void ComputeByteMap();
|
||||
|
||||
// Run peep-hole optimizer on program.
|
||||
void Optimize();
|
||||
|
||||
// One-pass NFA: only correct if IsOnePass() is true,
|
||||
// but much faster than NFA (competitive with PCRE)
|
||||
// for those expressions.
|
||||
bool IsOnePass();
|
||||
bool SearchOnePass(const StringPiece& text, const StringPiece& context,
|
||||
Anchor anchor, MatchKind kind,
|
||||
StringPiece* match, int nmatch);
|
||||
|
||||
// Bit-state backtracking. Fast on small cases but uses memory
|
||||
// proportional to the product of the program size and the text size.
|
||||
bool SearchBitState(const StringPiece& text, const StringPiece& context,
|
||||
Anchor anchor, MatchKind kind,
|
||||
StringPiece* match, int nmatch);
|
||||
|
||||
static const int kMaxOnePassCapture = 5; // $0 through $4
|
||||
|
||||
// Backtracking search: the gold standard against which the other
|
||||
// implementations are checked. FOR TESTING ONLY.
|
||||
// It allocates a ton of memory to avoid running forever.
|
||||
// It is also recursive, so can't use in production (will overflow stacks).
|
||||
// The name "Unsafe" here is supposed to be a flag that
|
||||
// you should not be using this function.
|
||||
bool UnsafeSearchBacktrack(const StringPiece& text,
|
||||
const StringPiece& context,
|
||||
Anchor anchor, MatchKind kind,
|
||||
StringPiece* match, int nmatch);
|
||||
|
||||
// Computes range for any strings matching regexp. The min and max can in
|
||||
// some cases be arbitrarily precise, so the caller gets to specify the
|
||||
// maximum desired length of string returned.
|
||||
//
|
||||
// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
|
||||
// string s that is an anchored match for this regexp satisfies
|
||||
// min <= s && s <= max.
|
||||
//
|
||||
// Note that PossibleMatchRange() will only consider the first copy of an
|
||||
// infinitely repeated element (i.e., any regexp element followed by a '*' or
|
||||
// '+' operator). Regexps with "{N}" constructions are not affected, as those
|
||||
// do not compile down to infinite repetitions.
|
||||
//
|
||||
// Returns true on success, false on error.
|
||||
bool PossibleMatchRange(string* min, string* max, int maxlen);
|
||||
|
||||
// EXPERIMENTAL! SUBJECT TO CHANGE!
|
||||
// Outputs the program fanout into the given sparse array.
|
||||
void Fanout(SparseArray<int>* fanout);
|
||||
|
||||
// Compiles a collection of regexps to Prog. Each regexp will have
|
||||
// its own Match instruction recording the index in the vector.
|
||||
static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
|
||||
Regexp* re);
|
||||
|
||||
private:
|
||||
friend class Compiler;
|
||||
|
||||
DFA* GetDFA(MatchKind kind);
|
||||
|
||||
bool anchor_start_; // regexp has explicit start anchor
|
||||
bool anchor_end_; // regexp has explicit end anchor
|
||||
bool reversed_; // whether program runs backward over input
|
||||
bool did_onepass_; // has IsOnePass been called?
|
||||
|
||||
int start_; // entry point for program
|
||||
int start_unanchored_; // unanchored entry point for program
|
||||
int size_; // number of instructions
|
||||
int byte_inst_count_; // number of kInstByteRange instructions
|
||||
int bytemap_range_; // bytemap_[x] < bytemap_range_
|
||||
int flags_; // regexp parse flags
|
||||
int onepass_statesize_; // byte size of each OneState* node
|
||||
|
||||
Inst* inst_; // pointer to instruction array
|
||||
|
||||
Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_
|
||||
DFA* volatile dfa_first_; // DFA cached for kFirstMatch
|
||||
DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch
|
||||
int64 dfa_mem_; // Maximum memory for DFAs.
|
||||
void (*delete_dfa_)(DFA* dfa);
|
||||
|
||||
Bitmap<256> byterange_; // byterange.Get(x) true if x ends a
|
||||
// commonly-treated byte range.
|
||||
uint8 bytemap_[256]; // map from input bytes to byte classes
|
||||
uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x
|
||||
|
||||
uint8* onepass_nodes_; // data for OnePass nodes
|
||||
OneState* onepass_start_; // start node for OnePass program
|
||||
|
||||
DISALLOW_COPY_AND_ASSIGN(Prog);
|
||||
};
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_PROG_H__
|
1239
third_party/re2/re2/re2.cc
vendored
1239
third_party/re2/re2/re2.cc
vendored
File diff suppressed because it is too large
Load Diff
882
third_party/re2/re2/re2.h
vendored
882
third_party/re2/re2/re2.h
vendored
@ -1,882 +0,0 @@
|
||||
// Copyright 2003-2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_RE2_H
|
||||
#define RE2_RE2_H
|
||||
|
||||
// C++ interface to the re2 regular-expression library.
|
||||
// RE2 supports Perl-style regular expressions (with extensions like
|
||||
// \d, \w, \s, ...).
|
||||
//
|
||||
// -----------------------------------------------------------------------
|
||||
// REGEXP SYNTAX:
|
||||
//
|
||||
// This module uses the re2 library and hence supports
|
||||
// its syntax for regular expressions, which is similar to Perl's with
|
||||
// some of the more complicated things thrown away. In particular,
|
||||
// backreferences and generalized assertions are not available, nor is \Z.
|
||||
//
|
||||
// See https://github.com/google/re2/wiki/Syntax for the syntax
|
||||
// supported by RE2, and a comparison with PCRE and PERL regexps.
|
||||
//
|
||||
// For those not familiar with Perl's regular expressions,
|
||||
// here are some examples of the most commonly used extensions:
|
||||
//
|
||||
// "hello (\\w+) world" -- \w matches a "word" character
|
||||
// "version (\\d+)" -- \d matches a digit
|
||||
// "hello\\s+world" -- \s matches any whitespace character
|
||||
// "\\b(\\w+)\\b" -- \b matches non-empty string at word boundary
|
||||
// "(?i)hello" -- (?i) turns on case-insensitive matching
|
||||
// "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible
|
||||
//
|
||||
// -----------------------------------------------------------------------
|
||||
// MATCHING INTERFACE:
|
||||
//
|
||||
// The "FullMatch" operation checks that supplied text matches a
|
||||
// supplied pattern exactly.
|
||||
//
|
||||
// Example: successful match
|
||||
// CHECK(RE2::FullMatch("hello", "h.*o"));
|
||||
//
|
||||
// Example: unsuccessful match (requires full match):
|
||||
// CHECK(!RE2::FullMatch("hello", "e"));
|
||||
//
|
||||
// -----------------------------------------------------------------------
|
||||
// UTF-8 AND THE MATCHING INTERFACE:
|
||||
//
|
||||
// By default, the pattern and input text are interpreted as UTF-8.
|
||||
// The RE2::Latin1 option causes them to be interpreted as Latin-1.
|
||||
//
|
||||
// Example:
|
||||
// CHECK(RE2::FullMatch(utf8_string, RE2(utf8_pattern)));
|
||||
// CHECK(RE2::FullMatch(latin1_string, RE2(latin1_pattern, RE2::Latin1)));
|
||||
//
|
||||
// -----------------------------------------------------------------------
|
||||
// MATCHING WITH SUB-STRING EXTRACTION:
|
||||
//
|
||||
// You can supply extra pointer arguments to extract matched subpieces.
|
||||
//
|
||||
// Example: extracts "ruby" into "s" and 1234 into "i"
|
||||
// int i;
|
||||
// string s;
|
||||
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s, &i));
|
||||
//
|
||||
// Example: fails because string cannot be stored in integer
|
||||
// CHECK(!RE2::FullMatch("ruby", "(.*)", &i));
|
||||
//
|
||||
// Example: fails because there aren't enough sub-patterns:
|
||||
// CHECK(!RE2::FullMatch("ruby:1234", "\\w+:\\d+", &s));
|
||||
//
|
||||
// Example: does not try to extract any extra sub-patterns
|
||||
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &s));
|
||||
//
|
||||
// Example: does not try to extract into NULL
|
||||
// CHECK(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", NULL, &i));
|
||||
//
|
||||
// Example: integer overflow causes failure
|
||||
// CHECK(!RE2::FullMatch("ruby:1234567891234", "\\w+:(\\d+)", &i));
|
||||
//
|
||||
// NOTE(rsc): Asking for substrings slows successful matches quite a bit.
|
||||
// This may get a little faster in the future, but right now is slower
|
||||
// than PCRE. On the other hand, failed matches run *very* fast (faster
|
||||
// than PCRE), as do matches without substring extraction.
|
||||
//
|
||||
// -----------------------------------------------------------------------
|
||||
// PARTIAL MATCHES
|
||||
//
|
||||
// You can use the "PartialMatch" operation when you want the pattern
|
||||
// to match any substring of the text.
|
||||
//
|
||||
// Example: simple search for a string:
|
||||
// CHECK(RE2::PartialMatch("hello", "ell"));
|
||||
//
|
||||
// Example: find first number in a string
|
||||
// int number;
|
||||
// CHECK(RE2::PartialMatch("x*100 + 20", "(\\d+)", &number));
|
||||
// CHECK_EQ(number, 100);
|
||||
//
|
||||
// -----------------------------------------------------------------------
|
||||
// PRE-COMPILED REGULAR EXPRESSIONS
|
||||
//
|
||||
// RE2 makes it easy to use any string as a regular expression, without
|
||||
// requiring a separate compilation step.
|
||||
//
|
||||
// If speed is of the essence, you can create a pre-compiled "RE2"
|
||||
// object from the pattern and use it multiple times. If you do so,
|
||||
// you can typically parse text faster than with sscanf.
|
||||
//
|
||||
// Example: precompile pattern for faster matching:
|
||||
// RE2 pattern("h.*o");
|
||||
// while (ReadLine(&str)) {
|
||||
// if (RE2::FullMatch(str, pattern)) ...;
|
||||
// }
|
||||
//
|
||||
// -----------------------------------------------------------------------
|
||||
// SCANNING TEXT INCREMENTALLY
|
||||
//
|
||||
// The "Consume" operation may be useful if you want to repeatedly
|
||||
// match regular expressions at the front of a string and skip over
|
||||
// them as they match. This requires use of the "StringPiece" type,
|
||||
// which represents a sub-range of a real string.
|
||||
//
|
||||
// Example: read lines of the form "var = value" from a string.
|
||||
// string contents = ...; // Fill string somehow
|
||||
// StringPiece input(contents); // Wrap a StringPiece around it
|
||||
//
|
||||
// string var;
|
||||
// int value;
|
||||
// while (RE2::Consume(&input, "(\\w+) = (\\d+)\n", &var, &value)) {
|
||||
// ...;
|
||||
// }
|
||||
//
|
||||
// Each successful call to "Consume" will set "var/value", and also
|
||||
// advance "input" so it points past the matched text. Note that if the
|
||||
// regular expression matches an empty string, input will advance
|
||||
// by 0 bytes. If the regular expression being used might match
|
||||
// an empty string, the loop body must check for this case and either
|
||||
// advance the string or break out of the loop.
|
||||
//
|
||||
// The "FindAndConsume" operation is similar to "Consume" but does not
|
||||
// anchor your match at the beginning of the string. For example, you
|
||||
// could extract all words from a string by repeatedly calling
|
||||
// RE2::FindAndConsume(&input, "(\\w+)", &word)
|
||||
//
|
||||
// -----------------------------------------------------------------------
|
||||
// USING VARIABLE NUMBER OF ARGUMENTS
|
||||
//
|
||||
// The above operations require you to know the number of arguments
|
||||
// when you write the code. This is not always possible or easy (for
|
||||
// example, the regular expression may be calculated at run time).
|
||||
// You can use the "N" version of the operations when the number of
|
||||
// match arguments is determined at run time.
|
||||
//
|
||||
// Example:
|
||||
// const RE2::Arg* args[10];
|
||||
// int n;
|
||||
// // ... populate args with pointers to RE2::Arg values ...
|
||||
// // ... set n to the number of RE2::Arg objects ...
|
||||
// bool match = RE2::FullMatchN(input, pattern, args, n);
|
||||
//
|
||||
// The last statement is equivalent to
|
||||
//
|
||||
// bool match = RE2::FullMatch(input, pattern,
|
||||
// *args[0], *args[1], ..., *args[n - 1]);
|
||||
//
|
||||
// -----------------------------------------------------------------------
|
||||
// PARSING HEX/OCTAL/C-RADIX NUMBERS
|
||||
//
|
||||
// By default, if you pass a pointer to a numeric value, the
|
||||
// corresponding text is interpreted as a base-10 number. You can
|
||||
// instead wrap the pointer with a call to one of the operators Hex(),
|
||||
// Octal(), or CRadix() to interpret the text in another base. The
|
||||
// CRadix operator interprets C-style "0" (base-8) and "0x" (base-16)
|
||||
// prefixes, but defaults to base-10.
|
||||
//
|
||||
// Example:
|
||||
// int a, b, c, d;
|
||||
// CHECK(RE2::FullMatch("100 40 0100 0x40", "(.*) (.*) (.*) (.*)",
|
||||
// RE2::Octal(&a), RE2::Hex(&b), RE2::CRadix(&c), RE2::CRadix(&d));
|
||||
// will leave 64 in a, b, c, and d.
|
||||
|
||||
#include <stdint.h>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include "re2/stringpiece.h"
|
||||
#include "re2/variadic_function.h"
|
||||
|
||||
#ifndef RE2_HAVE_LONGLONG
|
||||
#define RE2_HAVE_LONGLONG 1
|
||||
#endif
|
||||
|
||||
namespace re2 {
|
||||
|
||||
using std::string;
|
||||
using std::map;
|
||||
class Mutex;
|
||||
class Prog;
|
||||
class Regexp;
|
||||
|
||||
// The following enum should be used only as a constructor argument to indicate
|
||||
// that the variable has static storage class, and that the constructor should
|
||||
// do nothing to its state. It indicates to the reader that it is legal to
|
||||
// declare a static instance of the class, provided the constructor is given
|
||||
// the LINKER_INITIALIZED argument. Normally, it is unsafe to declare a
|
||||
// static variable that has a constructor or a destructor because invocation
|
||||
// order is undefined. However, IF the type can be initialized by filling with
|
||||
// zeroes (which the loader does for static variables), AND the type's
|
||||
// destructor does nothing to the storage, then a constructor for static
|
||||
// initialization can be declared as
|
||||
// explicit MyClass(LinkerInitialized x) {}
|
||||
// and invoked as
|
||||
// static MyClass my_variable_name(LINKER_INITIALIZED);
|
||||
// Tag type with a single enumerator; see the explanation above for when a
// constructor taking it may be used.
enum LinkerInitialized {
  LINKER_INITIALIZED
};
|
||||
|
||||
// Interface for regular expression matching. Also corresponds to a
|
||||
// pre-compiled regular expression. An "RE2" object is safe for
|
||||
// concurrent use by multiple threads.
|
||||
class RE2 {
|
||||
public:
|
||||
// We convert user-passed pointers into special Arg objects
|
||||
class Arg;
|
||||
class Options;
|
||||
|
||||
// Defined in set.h.
|
||||
class Set;
|
||||
|
||||
enum ErrorCode {
|
||||
NoError = 0,
|
||||
|
||||
// Unexpected error
|
||||
ErrorInternal,
|
||||
|
||||
// Parse errors
|
||||
ErrorBadEscape, // bad escape sequence
|
||||
ErrorBadCharClass, // bad character class
|
||||
ErrorBadCharRange, // bad character class range
|
||||
ErrorMissingBracket, // missing closing ]
|
||||
ErrorMissingParen, // missing closing )
|
||||
ErrorTrailingBackslash, // trailing \ at end of regexp
|
||||
ErrorRepeatArgument, // repeat argument missing, e.g. "*"
|
||||
ErrorRepeatSize, // bad repetition argument
|
||||
ErrorRepeatOp, // bad repetition operator
|
||||
ErrorBadPerlOp, // bad perl operator
|
||||
ErrorBadUTF8, // invalid UTF-8 in regexp
|
||||
ErrorBadNamedCapture, // bad named capture group
|
||||
ErrorPatternTooLarge // pattern too large (compile failed)
|
||||
};
|
||||
|
||||
// Predefined common options.
|
||||
// If you need more complicated things, instantiate
|
||||
// an Options object, possibly passing one of these to
|
||||
// the Options constructor, change the settings, and pass that
|
||||
// Options object to the RE2 constructor.
|
||||
enum CannedOptions {
|
||||
DefaultOptions = 0,
|
||||
Latin1, // treat input as Latin-1 (default UTF-8)
|
||||
POSIX, // POSIX syntax, leftmost-longest match
|
||||
Quiet // do not log about regexp parse errors
|
||||
};
|
||||
|
||||
// Need to have the const char* and const string& forms for implicit
|
||||
// conversions when passing string literals to FullMatch and PartialMatch.
|
||||
// Otherwise the StringPiece form would be sufficient.
|
||||
#ifndef SWIG
|
||||
RE2(const char* pattern);
|
||||
RE2(const string& pattern);
|
||||
#endif
|
||||
RE2(const StringPiece& pattern);
|
||||
RE2(const StringPiece& pattern, const Options& option);
|
||||
~RE2();
|
||||
|
||||
// Returns whether RE2 was created properly.
|
||||
bool ok() const { return error_code() == NoError; }
|
||||
|
||||
// The string specification for this RE2. E.g.
|
||||
// RE2 re("ab*c?d+");
|
||||
// re.pattern(); // "ab*c?d+"
|
||||
const string& pattern() const { return pattern_; }
|
||||
|
||||
// If RE2 could not be created properly, returns an error string.
|
||||
// Else returns the empty string.
|
||||
const string& error() const { return *error_; }
|
||||
|
||||
// If RE2 could not be created properly, returns an error code.
|
||||
// Else returns RE2::NoError (== 0).
|
||||
ErrorCode error_code() const { return error_code_; }
|
||||
|
||||
// If RE2 could not be created properly, returns the offending
|
||||
// portion of the regexp.
|
||||
const string& error_arg() const { return error_arg_; }
|
||||
|
||||
// Returns the program size, a very approximate measure of a regexp's "cost".
|
||||
// Larger numbers are more expensive than smaller numbers.
|
||||
int ProgramSize() const;
|
||||
|
||||
// EXPERIMENTAL! SUBJECT TO CHANGE!
|
||||
// Outputs the program fanout as a histogram bucketed by powers of 2.
|
||||
// Returns the number of the largest non-empty bucket.
|
||||
int ProgramFanout(map<int, int>* histogram) const;
|
||||
|
||||
// Returns the underlying Regexp; not for general use.
|
||||
// Returns entire_regexp_ so that callers don't need
|
||||
// to know about prefix_ and prefix_foldcase_.
|
||||
re2::Regexp* Regexp() const { return entire_regexp_; }
|
||||
|
||||
/***** The useful part: the matching interface *****/
|
||||
|
||||
// Matches "text" against "pattern". If pointer arguments are
|
||||
// supplied, copies matched sub-patterns into them.
|
||||
//
|
||||
// You can pass in a "const char*" or a "string" for "text".
|
||||
// You can pass in a "const char*" or a "string" or a "RE2" for "pattern".
|
||||
//
|
||||
// The provided pointer arguments can be pointers to any scalar numeric
|
||||
// type, or one of:
|
||||
// string (matched piece is copied to string)
|
||||
// StringPiece (StringPiece is mutated to point to matched piece)
|
||||
// T (where "bool T::ParseFrom(const char*, int)" exists)
|
||||
// (void*)NULL (the corresponding matched sub-pattern is not copied)
|
||||
//
|
||||
// Returns true iff all of the following conditions are satisfied:
|
||||
// a. "text" matches "pattern" exactly
|
||||
// b. The number of matched sub-patterns is >= number of supplied pointers
|
||||
// c. The "i"th argument has a suitable type for holding the
|
||||
// string captured as the "i"th sub-pattern. If you pass in
|
||||
// NULL for the "i"th argument, or pass fewer arguments than
|
||||
// number of sub-patterns, "i"th captured sub-pattern is
|
||||
// ignored.
|
||||
//
|
||||
// CAVEAT: An optional sub-pattern that does not exist in the
|
||||
// matched string is assigned the empty string. Therefore, the
|
||||
// following will return false (because the empty string is not a
|
||||
// valid number):
|
||||
// int number;
|
||||
// RE2::FullMatch("abc", "[a-z]+(\\d+)?", &number);
|
||||
static bool FullMatchN(const StringPiece& text, const RE2& re,
|
||||
const Arg* const args[], int argc);
|
||||
static const VariadicFunction2<
|
||||
bool, const StringPiece&, const RE2&, Arg, RE2::FullMatchN> FullMatch;
|
||||
|
||||
// Exactly like FullMatch(), except that "pattern" is allowed to match
|
||||
// a substring of "text".
|
||||
static bool PartialMatchN(const StringPiece& text, const RE2& re, // 3..16 args
|
||||
const Arg* const args[], int argc);
|
||||
static const VariadicFunction2<
|
||||
bool, const StringPiece&, const RE2&, Arg, RE2::PartialMatchN> PartialMatch;
|
||||
|
||||
// Like FullMatch() and PartialMatch(), except that pattern has to
|
||||
// match a prefix of "text", and "input" is advanced past the matched
|
||||
// text. Note: "input" is modified iff this routine returns true.
|
||||
static bool ConsumeN(StringPiece* input, const RE2& pattern, // 3..16 args
|
||||
const Arg* const args[], int argc);
|
||||
static const VariadicFunction2<
|
||||
bool, StringPiece*, const RE2&, Arg, RE2::ConsumeN> Consume;
|
||||
|
||||
// Like Consume(..), but does not anchor the match at the beginning of the
|
||||
// string. That is, "pattern" need not start its match at the beginning of
|
||||
// "input". For example, "FindAndConsume(s, "(\\w+)", &word)" finds the next
|
||||
// word in "s" and stores it in "word".
|
||||
static bool FindAndConsumeN(StringPiece* input, const RE2& pattern,
|
||||
const Arg* const args[], int argc);
|
||||
static const VariadicFunction2<
|
||||
bool, StringPiece*, const RE2&, Arg, RE2::FindAndConsumeN> FindAndConsume;
|
||||
|
||||
// Replace the first match of "pattern" in "str" with "rewrite".
|
||||
// Within "rewrite", backslash-escaped digits (\1 to \9) can be
|
||||
// used to insert text matching corresponding parenthesized group
|
||||
// from the pattern. \0 in "rewrite" refers to the entire matching
|
||||
// text. E.g.,
|
||||
//
|
||||
// string s = "yabba dabba doo";
|
||||
// CHECK(RE2::Replace(&s, "b+", "d"));
|
||||
//
|
||||
// will leave "s" containing "yada dabba doo"
|
||||
//
|
||||
// Returns true if the pattern matches and a replacement occurs,
|
||||
// false otherwise.
|
||||
static bool Replace(string *str,
|
||||
const RE2& pattern,
|
||||
const StringPiece& rewrite);
|
||||
|
||||
// Like Replace(), except replaces successive non-overlapping occurrences
|
||||
// of the pattern in the string with the rewrite. E.g.
|
||||
//
|
||||
// string s = "yabba dabba doo";
|
||||
// CHECK(RE2::GlobalReplace(&s, "b+", "d"));
|
||||
//
|
||||
// will leave "s" containing "yada dada doo"
|
||||
// Replacements are not subject to re-matching.
|
||||
//
|
||||
// Because GlobalReplace only replaces non-overlapping matches,
|
||||
// replacing "ana" within "banana" makes only one replacement, not two.
|
||||
//
|
||||
// Returns the number of replacements made.
|
||||
static int GlobalReplace(string *str,
|
||||
const RE2& pattern,
|
||||
const StringPiece& rewrite);
|
||||
|
||||
// Like Replace, except that if the pattern matches, "rewrite"
|
||||
// is copied into "out" with substitutions. The non-matching
|
||||
// portions of "text" are ignored.
|
||||
//
|
||||
// Returns true iff a match occurred and the extraction happened
|
||||
// successfully; if no match occurs, the string is left unaffected.
|
||||
//
|
||||
// REQUIRES: "text" must not alias any part of "*out".
|
||||
static bool Extract(const StringPiece &text,
|
||||
const RE2& pattern,
|
||||
const StringPiece &rewrite,
|
||||
string *out);
|
||||
|
||||
// Escapes all potentially meaningful regexp characters in
|
||||
// 'unquoted'. The returned string, used as a regular expression,
|
||||
// will exactly match the original string. For example,
|
||||
// 1.5-2.0?
|
||||
// may become:
|
||||
// 1\.5\-2\.0\?
|
||||
static string QuoteMeta(const StringPiece& unquoted);
|
||||
|
||||
// Computes range for any strings matching regexp. The min and max can in
|
||||
// some cases be arbitrarily precise, so the caller gets to specify the
|
||||
// maximum desired length of string returned.
|
||||
//
|
||||
// Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
|
||||
// string s that is an anchored match for this regexp satisfies
|
||||
// min <= s && s <= max.
|
||||
//
|
||||
// Note that PossibleMatchRange() will only consider the first copy of an
|
||||
// infinitely repeated element (i.e., any regexp element followed by a '*' or
|
||||
// '+' operator). Regexps with "{N}" constructions are not affected, as those
|
||||
// do not compile down to infinite repetitions.
|
||||
//
|
||||
// Returns true on success, false on error.
|
||||
bool PossibleMatchRange(string* min, string* max, int maxlen) const;
|
||||
|
||||
// Generic matching interface
|
||||
|
||||
// Type of match.
|
||||
enum Anchor {
|
||||
UNANCHORED, // No anchoring
|
||||
ANCHOR_START, // Anchor at start only
|
||||
ANCHOR_BOTH // Anchor at start and end
|
||||
};
|
||||
|
||||
// Return the number of capturing subpatterns, or -1 if the
|
||||
// regexp wasn't valid on construction. The overall match ($0)
|
||||
// does not count: if the regexp is "(a)(b)", returns 2.
|
||||
int NumberOfCapturingGroups() const;
|
||||
|
||||
// Return a map from names to capturing indices.
|
||||
// The map records the index of the leftmost group
|
||||
// with the given name.
|
||||
// Only valid until the re is deleted.
|
||||
const map<string, int>& NamedCapturingGroups() const;
|
||||
|
||||
// Return a map from capturing indices to names.
|
||||
// The map has no entries for unnamed groups.
|
||||
// Only valid until the re is deleted.
|
||||
const map<int, string>& CapturingGroupNames() const;
|
||||
|
||||
// General matching routine.
|
||||
// Match against text starting at offset startpos
|
||||
// and stopping the search at offset endpos.
|
||||
// Returns true if match found, false if not.
|
||||
// On a successful match, fills in match[] (up to nmatch entries)
|
||||
// with information about submatches.
|
||||
// I.e. matching RE2("(foo)|(bar)baz") on "barbazbla" will return true,
|
||||
// setting match[0] = "barbaz", match[1] = NULL, match[2] = "bar",
|
||||
// match[3] = NULL, ..., up to match[nmatch-1] = NULL.
|
||||
//
|
||||
// Don't ask for more match information than you will use:
|
||||
// runs much faster with nmatch == 1 than nmatch > 1, and
|
||||
// runs even faster if nmatch == 0.
|
||||
// Doesn't make sense to use nmatch > 1 + NumberOfCapturingGroups(),
|
||||
// but will be handled correctly.
|
||||
//
|
||||
// Passing text == StringPiece(NULL, 0) will be handled like any other
|
||||
// empty string, but note that on return, it will not be possible to tell
|
||||
// whether submatch i matched the empty string or did not match:
|
||||
// either way, match[i] == NULL.
|
||||
bool Match(const StringPiece& text,
|
||||
int startpos,
|
||||
int endpos,
|
||||
Anchor anchor,
|
||||
StringPiece *match,
|
||||
int nmatch) const;
|
||||
|
||||
// Check that the given rewrite string is suitable for use with this
|
||||
// regular expression. It checks that:
|
||||
// * The regular expression has enough parenthesized subexpressions
|
||||
// to satisfy all of the \N tokens in rewrite
|
||||
// * The rewrite string doesn't have any syntax errors. E.g.,
|
||||
// '\' followed by anything other than a digit or '\'.
|
||||
// A true return value guarantees that Replace() and Extract() won't
|
||||
// fail because of a bad rewrite string.
|
||||
bool CheckRewriteString(const StringPiece& rewrite, string* error) const;
|
||||
|
||||
// Returns the maximum submatch needed for the rewrite to be done by
|
||||
// Replace(). E.g. if rewrite == "foo \\2,\\1", returns 2.
|
||||
static int MaxSubmatch(const StringPiece& rewrite);
|
||||
|
||||
// Append the "rewrite" string, with backslash substitutions from "vec",
|
||||
// to string "out".
|
||||
// Returns true on success. This method can fail because of a malformed
|
||||
// rewrite string. CheckRewriteString guarantees that the rewrite will
|
||||
// be successful.
|
||||
bool Rewrite(string *out,
|
||||
const StringPiece &rewrite,
|
||||
const StringPiece* vec,
|
||||
int veclen) const;
|
||||
|
||||
// Constructor options
|
||||
class Options {
|
||||
public:
|
||||
// The options are (defaults in parentheses):
|
||||
//
|
||||
// utf8 (true) text and pattern are UTF-8; otherwise Latin-1
|
||||
// posix_syntax (false) restrict regexps to POSIX egrep syntax
|
||||
// longest_match (false) search for longest match, not first match
|
||||
// log_errors (true) log syntax and execution errors to ERROR
|
||||
// max_mem (see below) approx. max memory footprint of RE2
|
||||
// literal (false) interpret string as literal, not regexp
|
||||
// never_nl (false) never match \n, even if it is in regexp
|
||||
// dot_nl (false) dot matches everything including new line
|
||||
// never_capture (false) parse all parens as non-capturing
|
||||
// case_sensitive (true) match is case-sensitive (regexp can override
|
||||
// with (?i) unless in posix_syntax mode)
|
||||
//
|
||||
// The following options are only consulted when posix_syntax == true.
|
||||
// (When posix_syntax == false these features are always enabled and
|
||||
// cannot be turned off.)
|
||||
// perl_classes (false) allow Perl's \d \s \w \D \S \W
|
||||
// word_boundary (false) allow Perl's \b \B (word boundary and not)
|
||||
// one_line (false) ^ and $ only match beginning and end of text
|
||||
//
|
||||
// The max_mem option controls how much memory can be used
|
||||
// to hold the compiled form of the regexp (the Prog) and
|
||||
// its cached DFA graphs. Code Search placed limits on the number
|
||||
// of Prog instructions and DFA states: 10,000 for both.
|
||||
// In RE2, those limits would translate to about 240 KB per Prog
|
||||
// and perhaps 2.5 MB per DFA (DFA state sizes vary by regexp; RE2 does a
|
||||
// better job of keeping them small than Code Search did).
|
||||
// Each RE2 has two Progs (one forward, one reverse), and each Prog
|
||||
// can have two DFAs (one first match, one longest match).
|
||||
// That makes 4 DFAs:
|
||||
//
|
||||
// forward, first-match - used for UNANCHORED or ANCHOR_LEFT searches
|
||||
// if opt.longest_match() == false
|
||||
// forward, longest-match - used for all ANCHOR_BOTH searches,
|
||||
// and the other two kinds if
|
||||
// opt.longest_match() == true
|
||||
// reverse, first-match - never used
|
||||
// reverse, longest-match - used as second phase for unanchored searches
|
||||
//
|
||||
// The RE2 memory budget is statically divided between the two
|
||||
// Progs and then the DFAs: two thirds to the forward Prog
|
||||
// and one third to the reverse Prog. The forward Prog gives half
|
||||
// of what it has left over to each of its DFAs. The reverse Prog
|
||||
// gives it all to its longest-match DFA.
|
||||
//
|
||||
// Once a DFA fills its budget, it flushes its cache and starts over.
|
||||
// If this happens too often, RE2 falls back on the NFA implementation.
|
||||
|
||||
// For now, make the default budget something close to Code Search.
|
||||
static const int kDefaultMaxMem = 8<<20;
|
||||
|
||||
enum Encoding {
|
||||
EncodingUTF8 = 1,
|
||||
EncodingLatin1
|
||||
};
|
||||
|
||||
Options() :
|
||||
encoding_(EncodingUTF8),
|
||||
posix_syntax_(false),
|
||||
longest_match_(false),
|
||||
log_errors_(true),
|
||||
max_mem_(kDefaultMaxMem),
|
||||
literal_(false),
|
||||
never_nl_(false),
|
||||
dot_nl_(false),
|
||||
never_capture_(false),
|
||||
case_sensitive_(true),
|
||||
perl_classes_(false),
|
||||
word_boundary_(false),
|
||||
one_line_(false) {
|
||||
}
|
||||
|
||||
/*implicit*/ Options(CannedOptions);
|
||||
|
||||
Encoding encoding() const { return encoding_; }
|
||||
void set_encoding(Encoding encoding) { encoding_ = encoding; }
|
||||
|
||||
// Legacy interface to encoding.
|
||||
// TODO(rsc): Remove once clients have been converted.
|
||||
bool utf8() const { return encoding_ == EncodingUTF8; }
|
||||
void set_utf8(bool b) {
|
||||
if (b) {
|
||||
encoding_ = EncodingUTF8;
|
||||
} else {
|
||||
encoding_ = EncodingLatin1;
|
||||
}
|
||||
}
|
||||
|
||||
bool posix_syntax() const { return posix_syntax_; }
|
||||
void set_posix_syntax(bool b) { posix_syntax_ = b; }
|
||||
|
||||
bool longest_match() const { return longest_match_; }
|
||||
void set_longest_match(bool b) { longest_match_ = b; }
|
||||
|
||||
bool log_errors() const { return log_errors_; }
|
||||
void set_log_errors(bool b) { log_errors_ = b; }
|
||||
|
||||
int64_t max_mem() const { return max_mem_; }
|
||||
void set_max_mem(int64_t m) { max_mem_ = m; }
|
||||
|
||||
bool literal() const { return literal_; }
|
||||
void set_literal(bool b) { literal_ = b; }
|
||||
|
||||
bool never_nl() const { return never_nl_; }
|
||||
void set_never_nl(bool b) { never_nl_ = b; }
|
||||
|
||||
bool dot_nl() const { return dot_nl_; }
|
||||
void set_dot_nl(bool b) { dot_nl_ = b; }
|
||||
|
||||
bool never_capture() const { return never_capture_; }
|
||||
void set_never_capture(bool b) { never_capture_ = b; }
|
||||
|
||||
bool case_sensitive() const { return case_sensitive_; }
|
||||
void set_case_sensitive(bool b) { case_sensitive_ = b; }
|
||||
|
||||
bool perl_classes() const { return perl_classes_; }
|
||||
void set_perl_classes(bool b) { perl_classes_ = b; }
|
||||
|
||||
bool word_boundary() const { return word_boundary_; }
|
||||
void set_word_boundary(bool b) { word_boundary_ = b; }
|
||||
|
||||
bool one_line() const { return one_line_; }
|
||||
void set_one_line(bool b) { one_line_ = b; }
|
||||
|
||||
void Copy(const Options& src) {
|
||||
encoding_ = src.encoding_;
|
||||
posix_syntax_ = src.posix_syntax_;
|
||||
longest_match_ = src.longest_match_;
|
||||
log_errors_ = src.log_errors_;
|
||||
max_mem_ = src.max_mem_;
|
||||
literal_ = src.literal_;
|
||||
never_nl_ = src.never_nl_;
|
||||
dot_nl_ = src.dot_nl_;
|
||||
never_capture_ = src.never_capture_;
|
||||
case_sensitive_ = src.case_sensitive_;
|
||||
perl_classes_ = src.perl_classes_;
|
||||
word_boundary_ = src.word_boundary_;
|
||||
one_line_ = src.one_line_;
|
||||
}
|
||||
|
||||
int ParseFlags() const;
|
||||
|
||||
private:
|
||||
Encoding encoding_;
|
||||
bool posix_syntax_;
|
||||
bool longest_match_;
|
||||
bool log_errors_;
|
||||
int64_t max_mem_;
|
||||
bool literal_;
|
||||
bool never_nl_;
|
||||
bool dot_nl_;
|
||||
bool never_capture_;
|
||||
bool case_sensitive_;
|
||||
bool perl_classes_;
|
||||
bool word_boundary_;
|
||||
bool one_line_;
|
||||
|
||||
//DISALLOW_COPY_AND_ASSIGN(Options);
|
||||
Options(const Options&);
|
||||
void operator=(const Options&);
|
||||
};
|
||||
|
||||
// Returns the options set in the constructor.
|
||||
const Options& options() const { return options_; };
|
||||
|
||||
// Argument converters; see below.
|
||||
static inline Arg CRadix(short* x);
|
||||
static inline Arg CRadix(unsigned short* x);
|
||||
static inline Arg CRadix(int* x);
|
||||
static inline Arg CRadix(unsigned int* x);
|
||||
static inline Arg CRadix(long* x);
|
||||
static inline Arg CRadix(unsigned long* x);
|
||||
#if RE2_HAVE_LONGLONG
|
||||
static inline Arg CRadix(long long* x);
|
||||
static inline Arg CRadix(unsigned long long* x);
|
||||
#endif
|
||||
|
||||
static inline Arg Hex(short* x);
|
||||
static inline Arg Hex(unsigned short* x);
|
||||
static inline Arg Hex(int* x);
|
||||
static inline Arg Hex(unsigned int* x);
|
||||
static inline Arg Hex(long* x);
|
||||
static inline Arg Hex(unsigned long* x);
|
||||
#if RE2_HAVE_LONGLONG
|
||||
static inline Arg Hex(long long* x);
|
||||
static inline Arg Hex(unsigned long long* x);
|
||||
#endif
|
||||
|
||||
static inline Arg Octal(short* x);
|
||||
static inline Arg Octal(unsigned short* x);
|
||||
static inline Arg Octal(int* x);
|
||||
static inline Arg Octal(unsigned int* x);
|
||||
static inline Arg Octal(long* x);
|
||||
static inline Arg Octal(unsigned long* x);
|
||||
#if RE2_HAVE_LONGLONG
|
||||
static inline Arg Octal(long long* x);
|
||||
static inline Arg Octal(unsigned long long* x);
|
||||
#endif
|
||||
|
||||
private:
|
||||
void Init(const StringPiece& pattern, const Options& options);
|
||||
|
||||
bool DoMatch(const StringPiece& text,
|
||||
Anchor anchor,
|
||||
int* consumed,
|
||||
const Arg* const args[],
|
||||
int n) const;
|
||||
|
||||
re2::Prog* ReverseProg() const;
|
||||
|
||||
mutable Mutex* mutex_;
|
||||
string pattern_; // string regular expression
|
||||
Options options_; // option flags
|
||||
string prefix_; // required prefix (before regexp_)
|
||||
bool prefix_foldcase_; // prefix is ASCII case-insensitive
|
||||
re2::Regexp* entire_regexp_; // parsed regular expression
|
||||
re2::Regexp* suffix_regexp_; // parsed regular expression, prefix removed
|
||||
re2::Prog* prog_; // compiled program for regexp
|
||||
mutable re2::Prog* rprog_; // reverse program for regexp
|
||||
bool is_one_pass_; // can use prog_->SearchOnePass?
|
||||
mutable const string* error_; // Error indicator
|
||||
// (or points to empty string)
|
||||
mutable ErrorCode error_code_; // Error code
|
||||
mutable string error_arg_; // Fragment of regexp showing error
|
||||
mutable int num_captures_; // Number of capturing groups
|
||||
|
||||
// Map from capture names to indices
|
||||
mutable const map<string, int>* named_groups_;
|
||||
|
||||
// Map from capture indices to names
|
||||
mutable const map<int, string>* group_names_;
|
||||
|
||||
//DISALLOW_COPY_AND_ASSIGN(RE2);
|
||||
RE2(const RE2&);
|
||||
void operator=(const RE2&);
|
||||
};
|
||||
|
||||
/***** Implementation details *****/
|
||||
|
||||
// Hex/Octal/Binary?
|
||||
|
||||
// Parser adapter for destination types that provide a
// "bool ParseFrom(const char*, int)" member function.
template <class T>
class _RE2_MatchObject {
 public:
  // Hands the n bytes starting at str to the T that dest points at and
  // returns whatever T::ParseFrom reports.  A NULL dest means there is
  // nothing to fill in, which counts as success.
  static inline bool Parse(const char* str, int n, void* dest) {
    T* const target = static_cast<T*>(dest);
    return target == NULL || target->ParseFrom(str, n);
  }
};
|
||||
|
||||
// Destination for one captured sub-pattern: an untyped pointer to the
// caller's variable (arg_) paired with the function used to fill it
// (parser_).  Arg objects are created implicitly from the pointers that
// callers pass to the matching functions.
class RE2::Arg {
|
||||
public:
|
||||
// Empty constructor so we can declare arrays of RE2::Arg
|
||||
Arg();
|
||||
|
||||
// Constructor specially designed for NULL arguments
|
||||
Arg(void*);
|
||||
|
||||
// Signature shared by every parser: it receives the matched text
// (str, n) and the untyped destination pointer.
typedef bool (*Parser)(const char* str, int n, void* dest);
|
||||
|
||||
// Type-specific parsers
// MAKE_PARSER(type, name) declares two constructors taking a type*:
// one that selects the built-in parser `name`, and one that lets the
// caller supply the parser explicitly.
|
||||
#define MAKE_PARSER(type,name) \
|
||||
Arg(type* p) : arg_(p), parser_(name) { } \
|
||||
Arg(type* p, Parser parser) : arg_(p), parser_(parser) { } \
|
||||
|
||||
|
||||
MAKE_PARSER(char, parse_char);
|
||||
MAKE_PARSER(signed char, parse_char);
|
||||
MAKE_PARSER(unsigned char, parse_uchar);
|
||||
MAKE_PARSER(short, parse_short);
|
||||
MAKE_PARSER(unsigned short, parse_ushort);
|
||||
MAKE_PARSER(int, parse_int);
|
||||
MAKE_PARSER(unsigned int, parse_uint);
|
||||
MAKE_PARSER(long, parse_long);
|
||||
MAKE_PARSER(unsigned long, parse_ulong);
|
||||
#if RE2_HAVE_LONGLONG
|
||||
MAKE_PARSER(long long, parse_longlong);
|
||||
MAKE_PARSER(unsigned long long, parse_ulonglong);
|
||||
#endif
|
||||
MAKE_PARSER(float, parse_float);
|
||||
MAKE_PARSER(double, parse_double);
|
||||
MAKE_PARSER(string, parse_string);
|
||||
MAKE_PARSER(StringPiece, parse_stringpiece);
|
||||
|
||||
#undef MAKE_PARSER
|
||||
|
||||
// Generic constructor templates
// Used for pointer types not covered by MAKE_PARSER above; the
// one-argument form parses through _RE2_MatchObject<T>::Parse.
|
||||
template <class T> Arg(T* p)
|
||||
: arg_(p), parser_(_RE2_MatchObject<T>::Parse) { }
|
||||
template <class T> Arg(T* p, Parser parser)
|
||||
: arg_(p), parser_(parser) { }
|
||||
|
||||
// Parse the data
|
||||
bool Parse(const char* str, int n) const;
|
||||
|
||||
private:
|
||||
// Untyped pointer to the caller's destination variable.
void* arg_;
|
||||
// Function that fills *arg_ from the matched text.
Parser parser_;
|
||||
|
||||
static bool parse_null (const char* str, int n, void* dest);
|
||||
static bool parse_char (const char* str, int n, void* dest);
|
||||
static bool parse_uchar (const char* str, int n, void* dest);
|
||||
static bool parse_float (const char* str, int n, void* dest);
|
||||
static bool parse_double (const char* str, int n, void* dest);
|
||||
static bool parse_string (const char* str, int n, void* dest);
|
||||
static bool parse_stringpiece (const char* str, int n, void* dest);
|
||||
|
||||
// DECLARE_INTEGER_PARSER(name) declares, for one integer type, the
// private parse_<name> and parse_<name>_radix functions plus the public
// parse_<name>_hex, parse_<name>_octal and parse_<name>_cradix functions.
// The macro leaves the access level at "public".
#define DECLARE_INTEGER_PARSER(name) \
|
||||
private: \
|
||||
static bool parse_ ## name(const char* str, int n, void* dest); \
|
||||
static bool parse_ ## name ## _radix( \
|
||||
const char* str, int n, void* dest, int radix); \
|
||||
public: \
|
||||
static bool parse_ ## name ## _hex(const char* str, int n, void* dest); \
|
||||
static bool parse_ ## name ## _octal(const char* str, int n, void* dest); \
|
||||
static bool parse_ ## name ## _cradix(const char* str, int n, void* dest)
|
||||
|
||||
DECLARE_INTEGER_PARSER(short);
|
||||
DECLARE_INTEGER_PARSER(ushort);
|
||||
DECLARE_INTEGER_PARSER(int);
|
||||
DECLARE_INTEGER_PARSER(uint);
|
||||
DECLARE_INTEGER_PARSER(long);
|
||||
DECLARE_INTEGER_PARSER(ulong);
|
||||
#if RE2_HAVE_LONGLONG
|
||||
DECLARE_INTEGER_PARSER(longlong);
|
||||
DECLARE_INTEGER_PARSER(ulonglong);
|
||||
#endif
|
||||
|
||||
#undef DECLARE_INTEGER_PARSER
|
||||
};
|
||||
|
||||
// Default Arg: no destination pointer, parsed with parse_null.
inline RE2::Arg::Arg() : arg_(NULL), parser_(parse_null) { }
|
||||
// Arg built from an untyped pointer (the constructor meant for NULL
// arguments): stores p and also selects parse_null.
inline RE2::Arg::Arg(void* p) : arg_(p), parser_(parse_null) { }
|
||||
|
||||
// Runs the stored parser on the n bytes at str, passing it the stored
// destination pointer, and returns the parser's result.
inline bool RE2::Arg::Parse(const char* str, int n) const {
|
||||
return (*parser_)(str, n, arg_);
|
||||
}
|
||||
|
||||
// This part of the parser, appropriate only for ints, deals with bases
// MAKE_INTEGER_PARSER(type, name) defines the inline factories
// RE2::Hex(type*), RE2::Octal(type*) and RE2::CRadix(type*).  Each one
// returns an Arg that pairs the given pointer with the matching
// RE2::Arg::parse_<name>_hex / _octal / _cradix function.
|
||||
#define MAKE_INTEGER_PARSER(type, name) \
|
||||
inline RE2::Arg RE2::Hex(type* ptr) { \
|
||||
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _hex); } \
|
||||
inline RE2::Arg RE2::Octal(type* ptr) { \
|
||||
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _octal); } \
|
||||
inline RE2::Arg RE2::CRadix(type* ptr) { \
|
||||
return RE2::Arg(ptr, RE2::Arg::parse_ ## name ## _cradix); }
|
||||
|
||||
MAKE_INTEGER_PARSER(short, short)
|
||||
MAKE_INTEGER_PARSER(unsigned short, ushort)
|
||||
MAKE_INTEGER_PARSER(int, int)
|
||||
MAKE_INTEGER_PARSER(unsigned int, uint)
|
||||
MAKE_INTEGER_PARSER(long, long)
|
||||
MAKE_INTEGER_PARSER(unsigned long, ulong)
|
||||
// The long long overloads exist only when RE2_HAVE_LONGLONG is non-zero.
#if RE2_HAVE_LONGLONG
|
||||
MAKE_INTEGER_PARSER(long long, longlong)
|
||||
MAKE_INTEGER_PARSER(unsigned long long, ulonglong)
|
||||
#endif
|
||||
|
||||
#undef MAKE_INTEGER_PARSER
|
||||
|
||||
} // namespace re2
|
||||
|
||||
using re2::RE2;
|
||||
|
||||
#endif /* RE2_RE2_H */
|
938
third_party/re2/re2/regexp.cc
vendored
938
third_party/re2/re2/regexp.cc
vendored
@ -1,938 +0,0 @@
|
||||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Regular expression representation.
|
||||
// Tested by parse_test.cc
|
||||
|
||||
#include "util/util.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/stringpiece.h"
|
||||
#include "re2/walker-inl.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Constructor. Allocates vectors as appropriate for operator.
|
||||
Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
|
||||
: op_(static_cast<uint8>(op)),
|
||||
simple_(false),
|
||||
parse_flags_(static_cast<uint16>(parse_flags)),
|
||||
ref_(1),
|
||||
nsub_(0),
|
||||
down_(NULL) {
|
||||
subone_ = NULL;
|
||||
memset(the_union_, 0, sizeof the_union_);
|
||||
}
|
||||
|
||||
// Destructor. Assumes already cleaned up children.
|
||||
// Private: use Decref() instead of delete to destroy Regexps.
|
||||
// Can't call Decref on the sub-Regexps here because
|
||||
// that could cause arbitrarily deep recursion, so
|
||||
// required Decref() to have handled them for us.
|
||||
Regexp::~Regexp() {
|
||||
if (nsub_ > 0)
|
||||
LOG(DFATAL) << "Regexp not destroyed.";
|
||||
|
||||
switch (op_) {
|
||||
default:
|
||||
break;
|
||||
case kRegexpCapture:
|
||||
delete name_;
|
||||
break;
|
||||
case kRegexpLiteralString:
|
||||
delete[] runes_;
|
||||
break;
|
||||
case kRegexpCharClass:
|
||||
if (cc_)
|
||||
cc_->Delete();
|
||||
delete ccb_;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// If it's possible to destroy this regexp without recurring,
|
||||
// do so and return true. Else return false.
|
||||
bool Regexp::QuickDestroy() {
|
||||
if (nsub_ == 0) {
|
||||
delete this;
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Overflow table for reference counts: once a Regexp's inline ref_ field
// reaches kMaxRef, its real count is kept here, keyed by the Regexp.
// Created lazily by Incref().
static map<Regexp*, int> *ref_map;
|
||||
// Held around the accesses to ref_map in Ref(), Incref() and Decref().
GLOBAL_MUTEX(ref_mutex);
|
||||
|
||||
int Regexp::Ref() {
|
||||
if (ref_ < kMaxRef)
|
||||
return ref_;
|
||||
|
||||
GLOBAL_MUTEX_LOCK(ref_mutex);
|
||||
int r = 0;
|
||||
if (ref_map != NULL) {
|
||||
r = (*ref_map)[this];
|
||||
}
|
||||
GLOBAL_MUTEX_UNLOCK(ref_mutex);
|
||||
return r;
|
||||
}
|
||||
|
||||
// Increments reference count, returns object as convenience.
|
||||
Regexp* Regexp::Incref() {
|
||||
if (ref_ >= kMaxRef-1) {
|
||||
// Store ref count in overflow map.
|
||||
GLOBAL_MUTEX_LOCK(ref_mutex);
|
||||
if (ref_map == NULL) {
|
||||
ref_map = new map<Regexp*, int>;
|
||||
}
|
||||
if (ref_ == kMaxRef) {
|
||||
// already overflowed
|
||||
(*ref_map)[this]++;
|
||||
} else {
|
||||
// overflowing now
|
||||
(*ref_map)[this] = kMaxRef;
|
||||
ref_ = kMaxRef;
|
||||
}
|
||||
GLOBAL_MUTEX_UNLOCK(ref_mutex);
|
||||
return this;
|
||||
}
|
||||
|
||||
ref_++;
|
||||
return this;
|
||||
}
|
||||
|
||||
// Decrements reference count and deletes this object if count reaches 0.
|
||||
void Regexp::Decref() {
|
||||
if (ref_ == kMaxRef) {
|
||||
// Ref count is stored in overflow map.
|
||||
GLOBAL_MUTEX_LOCK(ref_mutex);
|
||||
int r = (*ref_map)[this] - 1;
|
||||
if (r < kMaxRef) {
|
||||
ref_ = static_cast<uint16>(r);
|
||||
ref_map->erase(this);
|
||||
} else {
|
||||
(*ref_map)[this] = r;
|
||||
}
|
||||
GLOBAL_MUTEX_UNLOCK(ref_mutex);
|
||||
return;
|
||||
}
|
||||
ref_--;
|
||||
if (ref_ == 0)
|
||||
Destroy();
|
||||
}
|
||||
|
||||
// Deletes this object; ref count has count reached 0.
|
||||
void Regexp::Destroy() {
|
||||
if (QuickDestroy())
|
||||
return;
|
||||
|
||||
// Handle recursive Destroy with explicit stack
|
||||
// to avoid arbitrarily deep recursion on process stack [sigh].
|
||||
down_ = NULL;
|
||||
Regexp* stack = this;
|
||||
while (stack != NULL) {
|
||||
Regexp* re = stack;
|
||||
stack = re->down_;
|
||||
if (re->ref_ != 0)
|
||||
LOG(DFATAL) << "Bad reference count " << re->ref_;
|
||||
if (re->nsub_ > 0) {
|
||||
Regexp** subs = re->sub();
|
||||
for (int i = 0; i < re->nsub_; i++) {
|
||||
Regexp* sub = subs[i];
|
||||
if (sub == NULL)
|
||||
continue;
|
||||
if (sub->ref_ == kMaxRef)
|
||||
sub->Decref();
|
||||
else
|
||||
--sub->ref_;
|
||||
if (sub->ref_ == 0 && !sub->QuickDestroy()) {
|
||||
sub->down_ = stack;
|
||||
stack = sub;
|
||||
}
|
||||
}
|
||||
if (re->nsub_ > 1)
|
||||
delete[] subs;
|
||||
re->nsub_ = 0;
|
||||
}
|
||||
delete re;
|
||||
}
|
||||
}
|
||||
|
||||
// Appends r to the rune array of a kRegexpLiteralString node.  The array
// starts with room for 8 runes and is doubled each time the current
// length reaches a power of two (8, 16, 32, ...).
void Regexp::AddRuneToString(Rune r) {
  DCHECK(op_ == kRegexpLiteralString);

  const bool is_first_rune = (nrunes_ == 0);
  if (is_first_rune) {
    runes_ = new Rune[8];
  } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
    // Full at a power-of-two length: move everything to a buffer twice
    // the size.
    Rune* grown = new Rune[nrunes_ * 2];
    for (int i = 0; i < nrunes_; i++)
      grown[i] = runes_[i];
    delete[] runes_;
    runes_ = grown;
  }

  runes_[nrunes_++] = r;
}
|
||||
|
||||
// Creates a kRegexpHaveMatch node carrying the given match_id.
Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
  Regexp* node = new Regexp(kRegexpHaveMatch, flags);
  node->match_id_ = match_id;
  return node;
}
|
||||
|
||||
// Returns a node matching sub one or more times.
// When sub is itself a plus with identical flags it is returned as-is,
// since repeating the operator adds nothing.  Otherwise the new node
// stores the sub pointer directly (no Incref is performed here).
Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
  const bool already_plus =
      sub->op() == kRegexpPlus && sub->parse_flags() == flags;
  if (already_plus)
    return sub;

  Regexp* node = new Regexp(kRegexpPlus, flags);
  node->AllocSub(1);
  node->sub()[0] = sub;
  return node;
}
|
||||
|
||||
// Returns a node matching sub zero or more times.
// When sub is itself a star with identical flags it is returned as-is.
// Otherwise the new node stores the sub pointer directly (no Incref is
// performed here).
Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
  const bool already_star =
      sub->op() == kRegexpStar && sub->parse_flags() == flags;
  if (already_star)
    return sub;

  Regexp* node = new Regexp(kRegexpStar, flags);
  node->AllocSub(1);
  node->sub()[0] = sub;
  return node;
}
|
||||
|
||||
// Returns a node matching sub zero or one times.
// When sub is itself a quest with identical flags it is returned as-is.
// Otherwise the new node stores the sub pointer directly (no Incref is
// performed here).
Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
  const bool already_quest =
      sub->op() == kRegexpQuest && sub->parse_flags() == flags;
  if (already_quest)
    return sub;

  Regexp* node = new Regexp(kRegexpQuest, flags);
  node->AllocSub(1);
  node->sub()[0] = sub;
  return node;
}
|
||||
|
||||
// Shared implementation of Concat, Alternate and AlternateNoFactor.
//
//   op          kRegexpConcat or kRegexpAlternate.
//   sub, nsub   the operands; the caller's array is never modified.
//   can_factor  when true and op is kRegexpAlternate, the operands are
//               first passed through FactorAlternation on a private copy.
//
// A single operand is returned unchanged.  No operands yields
// kRegexpNoMatch for an alternation and kRegexpEmptyMatch otherwise.
Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
                                  ParseFlags flags, bool can_factor) {
  if (nsub == 1)
    return sub[0];

  if (nsub == 0) {
    const RegexpOp empty_op =
        (op == kRegexpAlternate) ? kRegexpNoMatch : kRegexpEmptyMatch;
    return new Regexp(empty_op, flags);
  }

  Regexp** subcopy = NULL;
  if (op == kRegexpAlternate && can_factor) {
    // Factoring edits the array in place, so work on a private copy.
    subcopy = new Regexp*[nsub];
    memmove(subcopy, sub, nsub * sizeof sub[0]);
    sub = subcopy;
    nsub = FactorAlternation(sub, nsub, flags);
    if (nsub == 1) {
      Regexp* only = sub[0];
      delete[] subcopy;
      return only;
    }
  }

  Regexp* re = new Regexp(op, flags);
  if (nsub > kMaxNsub) {
    // Too many operands for a single node: build a two-level tree whose
    // children each hold at most kMaxNsub operands.  The last child takes
    // whatever is left over.
    const int nbigsub = (nsub + kMaxNsub - 1) / kMaxNsub;
    const int last = nbigsub - 1;
    re->AllocSub(nbigsub);
    Regexp** subs = re->sub();
    for (int i = 0; i < last; i++)
      subs[i] = ConcatOrAlternate(op, sub + i*kMaxNsub, kMaxNsub, flags, false);
    subs[last] = ConcatOrAlternate(op, sub + last*kMaxNsub,
                                   nsub - last*kMaxNsub, flags,
                                   false);
  } else {
    re->AllocSub(nsub);
    Regexp** subs = re->sub();
    for (int i = 0; i < nsub; i++)
      subs[i] = sub[i];
  }

  delete[] subcopy;  // NULL when no copy was made.
  return re;
}
|
||||
|
||||
// Builds the concatenation of sub[0..nsub-1].  Factoring is never applied.
Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
  const bool can_factor = false;
  return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, can_factor);
}
|
||||
|
||||
// Builds the alternation of sub[0..nsub-1], with factoring enabled.
Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
  const bool can_factor = true;
  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, can_factor);
}
|
||||
|
||||
// Builds the alternation of sub[0..nsub-1] without factoring.
Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
  const bool can_factor = false;
  return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, can_factor);
}
|
||||
|
||||
// Wraps sub in a capturing group whose index is cap.
// The sub pointer is stored directly; no Incref is performed here.
Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
  Regexp* group = new Regexp(kRegexpCapture, flags);
  group->cap_ = cap;
  group->AllocSub(1);
  group->sub()[0] = sub;
  return group;
}
|
||||
|
||||
// Builds a counted repetition of sub with bounds min and max, which are
// stored unchecked in min_ and max_.
// The sub pointer is stored directly; no Incref is performed here.
Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
  Regexp* rep = new Regexp(kRegexpRepeat, flags);
  rep->min_ = min;
  rep->max_ = max;
  rep->AllocSub(1);
  rep->sub()[0] = sub;
  return rep;
}
|
||||
|
||||
// Creates a kRegexpLiteral node for the single rune.
Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
  Regexp* lit = new Regexp(kRegexpLiteral, flags);
  lit->rune_ = rune;
  return lit;
}
|
||||
|
||||
// Creates a node matching the nrunes runes starting at runes.
// Zero (or negative) length yields kRegexpEmptyMatch, length one yields a
// plain kRegexpLiteral, and anything longer yields a kRegexpLiteralString
// holding its own copy of the runes.
Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
  if (nrunes <= 0)
    return new Regexp(kRegexpEmptyMatch, flags);
  if (nrunes == 1)
    return NewLiteral(runes[0], flags);

  Regexp* str = new Regexp(kRegexpLiteralString, flags);
  for (Rune* p = runes; p != runes + nrunes; ++p)
    str->AddRuneToString(*p);
  return str;
}
|
||||
|
||||
// Creates a kRegexpCharClass node that stores the cc pointer as given.
Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
  Regexp* node = new Regexp(kRegexpCharClass, flags);
  node->cc_ = cc;
  return node;
}
|
||||
|
||||
// Swaps this and that in place.
|
||||
void Regexp::Swap(Regexp* that) {
|
||||
// Can use memmove because Regexp is just a struct (no vtable).
|
||||
char tmp[sizeof *this];
|
||||
memmove(tmp, this, sizeof tmp);
|
||||
memmove(this, that, sizeof tmp);
|
||||
memmove(that, tmp, sizeof tmp);
|
||||
}
|
||||
|
||||
// Tests equality of all top-level structure but not subregexps.
|
||||
static bool TopEqual(Regexp* a, Regexp* b) {
|
||||
if (a->op() != b->op())
|
||||
return false;
|
||||
|
||||
switch (a->op()) {
|
||||
case kRegexpNoMatch:
|
||||
case kRegexpEmptyMatch:
|
||||
case kRegexpAnyChar:
|
||||
case kRegexpAnyByte:
|
||||
case kRegexpBeginLine:
|
||||
case kRegexpEndLine:
|
||||
case kRegexpWordBoundary:
|
||||
case kRegexpNoWordBoundary:
|
||||
case kRegexpBeginText:
|
||||
return true;
|
||||
|
||||
case kRegexpEndText:
|
||||
// The parse flags remember whether it's \z or (?-m:$),
|
||||
// which matters when testing against PCRE.
|
||||
return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
|
||||
|
||||
case kRegexpLiteral:
|
||||
return a->rune() == b->rune() &&
|
||||
((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
|
||||
|
||||
case kRegexpLiteralString:
|
||||
return a->nrunes() == b->nrunes() &&
|
||||
((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
|
||||
memcmp(a->runes(), b->runes(),
|
||||
a->nrunes() * sizeof a->runes()[0]) == 0;
|
||||
|
||||
case kRegexpAlternate:
|
||||
case kRegexpConcat:
|
||||
return a->nsub() == b->nsub();
|
||||
|
||||
case kRegexpStar:
|
||||
case kRegexpPlus:
|
||||
case kRegexpQuest:
|
||||
return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
|
||||
|
||||
case kRegexpRepeat:
|
||||
return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
|
||||
a->min() == b->min() &&
|
||||
a->max() == b->max();
|
||||
|
||||
case kRegexpCapture:
|
||||
return a->cap() == b->cap() && a->name() == b->name();
|
||||
|
||||
case kRegexpHaveMatch:
|
||||
return a->match_id() == b->match_id();
|
||||
|
||||
case kRegexpCharClass: {
|
||||
CharClass* acc = a->cc();
|
||||
CharClass* bcc = b->cc();
|
||||
return acc->size() == bcc->size() &&
|
||||
acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
|
||||
memcmp(acc->begin(), bcc->begin(),
|
||||
(acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
|
||||
}
|
||||
}
|
||||
|
||||
LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Reports whether the regexp trees rooted at a and b are structurally
// equal.  Two NULL pointers are equal; NULL never equals non-NULL.
// The comparison is iterative (explicit work list), not recursive.
bool Regexp::Equal(Regexp* a, Regexp* b) {
  if (a == NULL || b == NULL)
    return a == b;

  if (!TopEqual(a, b))
    return false;

  // Fast path: ops without subregexps are fully decided by TopEqual,
  // so return before allocating the work list.
  switch (a->op()) {
    case kRegexpAlternate:
    case kRegexpConcat:
    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest:
    case kRegexpRepeat:
    case kRegexpCapture:
      break;

    default:
      return true;
  }

  // Work list of regexp pairs still to be compared, stored flat as
  // (a, b, a, b, ...).  The trees are equal only if every pair is.
  vector<Regexp*> pending;

  for (;;) {
    // Invariant: TopEqual(a, b) == true.
    switch (a->op()) {
      default:
        break;

      case kRegexpAlternate:
      case kRegexpConcat: {
        for (int i = 0; i < a->nsub(); i++) {
          Regexp* suba = a->sub()[i];
          Regexp* subb = b->sub()[i];
          if (!TopEqual(suba, subb))
            return false;
          pending.push_back(suba);
          pending.push_back(subb);
        }
        break;
      }

      case kRegexpStar:
      case kRegexpPlus:
      case kRegexpQuest:
      case kRegexpRepeat:
      case kRegexpCapture: {
        Regexp* suba = a->sub()[0];
        Regexp* subb = b->sub()[0];
        if (!TopEqual(suba, subb))
          return false;
        // Equivalent to pushing the pair and popping it right away,
        // but cheaper: descend into the only child directly.
        a = suba;
        b = subb;
        continue;
      }
    }

    size_t n = pending.size();
    if (n == 0)
      break;

    // Pop the most recently pushed pair.
    DCHECK_GE(n, 2);
    a = pending[n-2];
    b = pending[n-1];
    pending.resize(n-2);
  }

  return true;
}
|
||||
|
||||
// Human-readable text for each RegexpStatusCode, indexed by code value.
// Keep in sync with enum RegexpStatusCode in regexp.h
static const char *kErrorStrings[] = {
  "no error",                             // kRegexpSuccess
  "unexpected error",                     // kRegexpInternalError
  "invalid escape sequence",              // kRegexpBadEscape
  "invalid character class",              // kRegexpBadCharClass
  "invalid character class range",        // kRegexpBadCharRange
  "missing ]",                            // kRegexpMissingBracket
  "missing )",                            // kRegexpMissingParen
  "trailing \\",                          // kRegexpTrailingBackslash
  "no argument for repetition operator",  // kRegexpRepeatArgument
  "invalid repetition size",              // kRegexpRepeatSize
  "bad repetition operator",              // kRegexpRepeatOp
  "invalid perl operator",                // kRegexpBadPerlOp
  "invalid UTF-8",                        // kRegexpBadUTF8
  "invalid named capture group",          // kRegexpBadNamedCapture
};
|
||||
|
||||
// Returns the kErrorStrings entry for code.  A code outside the table is
// reported with the kRegexpInternalError text.
string RegexpStatus::CodeText(enum RegexpStatusCode code) {
  const bool in_table = !(code < 0 || code >= arraysize(kErrorStrings));
  if (!in_table)
    code = kRegexpInternalError;
  return kErrorStrings[code];
}
|
||||
|
||||
// Returns the text for code_, followed by ": " and the offending piece of
// the regexp when error_arg_ is non-empty.
string RegexpStatus::Text() const {
  string text = CodeText(code_);
  if (!error_arg_.empty()) {
    text.append(": ");
    text.append(error_arg_.data(), error_arg_.size());
  }
  return text;
}
|
||||
|
||||
// Copies the error code and error argument from status.
// Only code_ and error_arg_ are assigned; tmp_ is left untouched.
// NOTE(review): error_arg_ is a StringPiece, so this presumably copies a
// view rather than the underlying bytes -- confirm the source outlives it.
void RegexpStatus::Copy(const RegexpStatus& status) {
  error_arg_ = status.error_arg_;
  code_ = status.code_;
}
|
||||
|
||||
typedef int Ignored; // Walker<void> doesn't exist
|
||||
|
||||
// Walker subclass to count capturing parens in regexp.
|
||||
class NumCapturesWalker : public Regexp::Walker<Ignored> {
|
||||
public:
|
||||
NumCapturesWalker() : ncapture_(0) {}
|
||||
int ncapture() { return ncapture_; }
|
||||
|
||||
virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
|
||||
if (re->op() == kRegexpCapture)
|
||||
ncapture_++;
|
||||
return ignored;
|
||||
}
|
||||
virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
|
||||
// Should never be called: we use Walk not WalkExponential.
|
||||
LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
|
||||
return ignored;
|
||||
}
|
||||
|
||||
private:
|
||||
int ncapture_;
|
||||
DISALLOW_COPY_AND_ASSIGN(NumCapturesWalker);
|
||||
};
|
||||
|
||||
int Regexp::NumCaptures() {
|
||||
NumCapturesWalker w;
|
||||
w.Walk(this, 0);
|
||||
return w.ncapture();
|
||||
}
|
||||
|
||||
// Walker class to build map of named capture groups and their indices.
|
||||
class NamedCapturesWalker : public Regexp::Walker<Ignored> {
|
||||
public:
|
||||
NamedCapturesWalker() : map_(NULL) {}
|
||||
~NamedCapturesWalker() { delete map_; }
|
||||
|
||||
map<string, int>* TakeMap() {
|
||||
map<string, int>* m = map_;
|
||||
map_ = NULL;
|
||||
return m;
|
||||
}
|
||||
|
||||
Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
|
||||
if (re->op() == kRegexpCapture && re->name() != NULL) {
|
||||
// Allocate map once we find a name.
|
||||
if (map_ == NULL)
|
||||
map_ = new map<string, int>;
|
||||
|
||||
// Record first occurrence of each name.
|
||||
// (The rule is that if you have the same name
|
||||
// multiple times, only the leftmost one counts.)
|
||||
if (map_->find(*re->name()) == map_->end())
|
||||
(*map_)[*re->name()] = re->cap();
|
||||
}
|
||||
return ignored;
|
||||
}
|
||||
|
||||
virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
|
||||
// Should never be called: we use Walk not WalkExponential.
|
||||
LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
|
||||
return ignored;
|
||||
}
|
||||
|
||||
private:
|
||||
map<string, int>* map_;
|
||||
DISALLOW_COPY_AND_ASSIGN(NamedCapturesWalker);
|
||||
};
|
||||
|
||||
// Returns a newly allocated name -> index map of the named capture groups,
// or NULL when the regexp has none.  The caller owns the result.
map<string, int>* Regexp::NamedCaptures() {
  NamedCapturesWalker collector;
  collector.Walk(this, 0);
  return collector.TakeMap();
}
|
||||
|
||||
// Walker class to build map from capture group indices to their names.
|
||||
class CaptureNamesWalker : public Regexp::Walker<Ignored> {
|
||||
public:
|
||||
CaptureNamesWalker() : map_(NULL) {}
|
||||
~CaptureNamesWalker() { delete map_; }
|
||||
|
||||
map<int, string>* TakeMap() {
|
||||
map<int, string>* m = map_;
|
||||
map_ = NULL;
|
||||
return m;
|
||||
}
|
||||
|
||||
Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
|
||||
if (re->op() == kRegexpCapture && re->name() != NULL) {
|
||||
// Allocate map once we find a name.
|
||||
if (map_ == NULL)
|
||||
map_ = new map<int, string>;
|
||||
|
||||
(*map_)[re->cap()] = *re->name();
|
||||
}
|
||||
return ignored;
|
||||
}
|
||||
|
||||
virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
|
||||
// Should never be called: we use Walk not WalkExponential.
|
||||
LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
|
||||
return ignored;
|
||||
}
|
||||
|
||||
private:
|
||||
map<int, string>* map_;
|
||||
DISALLOW_COPY_AND_ASSIGN(CaptureNamesWalker);
|
||||
};
|
||||
|
||||
// Returns a newly allocated index -> name map of the named capture groups,
// or NULL when the regexp has none.  The caller owns the result.
map<int, string>* Regexp::CaptureNames() {
  CaptureNamesWalker collector;
  collector.Walk(this, 0);
  return collector.TakeMap();
}
|
||||
|
||||
// Determines whether regexp matches must be anchored
|
||||
// with a fixed string prefix. If so, returns the prefix and
|
||||
// the regexp that remains after the prefix. The prefix might
|
||||
// be ASCII case-insensitive.
|
||||
bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
|
||||
// No need for a walker: the regexp must be of the form
|
||||
// 1. some number of ^ anchors
|
||||
// 2. a literal char or string
|
||||
// 3. the rest
|
||||
prefix->clear();
|
||||
*foldcase = false;
|
||||
*suffix = NULL;
|
||||
if (op_ != kRegexpConcat)
|
||||
return false;
|
||||
|
||||
// Some number of anchors, then a literal or concatenation.
|
||||
int i = 0;
|
||||
Regexp** sub = this->sub();
|
||||
while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
|
||||
i++;
|
||||
if (i == 0 || i >= nsub_)
|
||||
return false;
|
||||
|
||||
Regexp* re = sub[i];
|
||||
switch (re->op_) {
|
||||
default:
|
||||
return false;
|
||||
|
||||
case kRegexpLiteralString:
|
||||
// Convert to string in proper encoding.
|
||||
if (re->parse_flags() & Latin1) {
|
||||
prefix->resize(re->nrunes_);
|
||||
for (int j = 0; j < re->nrunes_; j++)
|
||||
(*prefix)[j] = static_cast<char>(re->runes_[j]);
|
||||
} else {
|
||||
// Convert to UTF-8 in place.
|
||||
// Assume worst-case space and then trim.
|
||||
prefix->resize(re->nrunes_ * UTFmax);
|
||||
char *p = &(*prefix)[0];
|
||||
for (int j = 0; j < re->nrunes_; j++) {
|
||||
Rune r = re->runes_[j];
|
||||
if (r < Runeself)
|
||||
*p++ = static_cast<char>(r);
|
||||
else
|
||||
p += runetochar(p, &r);
|
||||
}
|
||||
prefix->resize(p - &(*prefix)[0]);
|
||||
}
|
||||
break;
|
||||
|
||||
case kRegexpLiteral:
|
||||
if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
|
||||
prefix->append(1, static_cast<char>(re->rune_));
|
||||
} else {
|
||||
char buf[UTFmax];
|
||||
prefix->append(buf, runetochar(buf, &re->rune_));
|
||||
}
|
||||
break;
|
||||
}
|
||||
*foldcase = (sub[i]->parse_flags() & FoldCase) != 0;
|
||||
i++;
|
||||
|
||||
// The rest.
|
||||
if (i < nsub_) {
|
||||
for (int j = i; j < nsub_; j++)
|
||||
sub[j]->Incref();
|
||||
re = Concat(sub + i, nsub_ - i, parse_flags());
|
||||
} else {
|
||||
re = new Regexp(kRegexpEmptyMatch, parse_flags());
|
||||
}
|
||||
*suffix = re;
|
||||
return true;
|
||||
}
|
||||
|
||||
// Character class builder is a balanced binary tree (STL set)
|
||||
// containing non-overlapping, non-abutting RuneRanges.
|
||||
// The less-than operator used in the tree treats two
|
||||
// ranges as equal if they overlap at all, so that
|
||||
// lookups for a particular Rune are possible.
|
||||
|
||||
// Creates an empty builder: no runes and neither ASCII-letter bitmap set.
CharClassBuilder::CharClassBuilder() {
  upper_ = 0;
  lower_ = 0;
  nrunes_ = 0;
}
|
||||
|
||||
// Add lo-hi to the class; return whether class got bigger.
|
||||
bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
|
||||
if (hi < lo)
|
||||
return false;
|
||||
|
||||
if (lo <= 'z' && hi >= 'A') {
|
||||
// Overlaps some alpha, maybe not all.
|
||||
// Update bitmaps telling which ASCII letters are in the set.
|
||||
Rune lo1 = max<Rune>(lo, 'A');
|
||||
Rune hi1 = min<Rune>(hi, 'Z');
|
||||
if (lo1 <= hi1)
|
||||
upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
|
||||
|
||||
lo1 = max<Rune>(lo, 'a');
|
||||
hi1 = min<Rune>(hi, 'z');
|
||||
if (lo1 <= hi1)
|
||||
lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
|
||||
}
|
||||
|
||||
{ // Check whether lo, hi is already in the class.
|
||||
iterator it = ranges_.find(RuneRange(lo, lo));
|
||||
if (it != end() && it->lo <= lo && hi <= it->hi)
|
||||
return false;
|
||||
}
|
||||
|
||||
// Look for a range abutting lo on the left.
|
||||
// If it exists, take it out and increase our range.
|
||||
if (lo > 0) {
|
||||
iterator it = ranges_.find(RuneRange(lo-1, lo-1));
|
||||
if (it != end()) {
|
||||
lo = it->lo;
|
||||
if (it->hi > hi)
|
||||
hi = it->hi;
|
||||
nrunes_ -= it->hi - it->lo + 1;
|
||||
ranges_.erase(it);
|
||||
}
|
||||
}
|
||||
|
||||
// Look for a range abutting hi on the right.
|
||||
// If it exists, take it out and increase our range.
|
||||
if (hi < Runemax) {
|
||||
iterator it = ranges_.find(RuneRange(hi+1, hi+1));
|
||||
if (it != end()) {
|
||||
hi = it->hi;
|
||||
nrunes_ -= it->hi - it->lo + 1;
|
||||
ranges_.erase(it);
|
||||
}
|
||||
}
|
||||
|
||||
// Look for ranges between lo and hi. Take them out.
|
||||
// This is only safe because the set has no overlapping ranges.
|
||||
// We've already removed any ranges abutting lo and hi, so
|
||||
// any that overlap [lo, hi] must be contained within it.
|
||||
for (;;) {
|
||||
iterator it = ranges_.find(RuneRange(lo, hi));
|
||||
if (it == end())
|
||||
break;
|
||||
nrunes_ -= it->hi - it->lo + 1;
|
||||
ranges_.erase(it);
|
||||
}
|
||||
|
||||
// Finally, add [lo, hi].
|
||||
nrunes_ += hi - lo + 1;
|
||||
ranges_.insert(RuneRange(lo, hi));
|
||||
return true;
|
||||
}
|
||||
|
||||
// Adds every range of cc to this class.
void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
  iterator it = cc->begin();
  while (it != cc->end()) {
    AddRange(it->lo, it->hi);
    ++it;
  }
}
|
||||
|
||||
// Reports whether rune r lies in one of the stored ranges.
bool CharClassBuilder::Contains(Rune r) {
  const RuneRange probe(r, r);
  return ranges_.find(probe) != end();
}
|
||||
|
||||
// Does the character class behave the same on A-Z as on a-z?
|
||||
bool CharClassBuilder::FoldsASCII() {
|
||||
return ((upper_ ^ lower_) & AlphaMask) == 0;
|
||||
}
|
||||
|
||||
// Returns a newly allocated builder with the same ranges, letter bitmaps
// and rune count as this one.  The caller owns the result.
CharClassBuilder* CharClassBuilder::Copy() {
  CharClassBuilder* dup = new CharClassBuilder;
  dup->nrunes_ = nrunes_;
  dup->upper_ = upper_;
  dup->lower_ = lower_;
  iterator it = begin();
  while (it != end()) {
    dup->ranges_.insert(RuneRange(it->lo, it->hi));
    ++it;
  }
  return dup;
}
|
||||
|
||||
|
||||
|
||||
// Removes every rune greater than r from the class.
void CharClassBuilder::RemoveAbove(Rune r) {
  if (r >= Runemax)
    return;

  // Clear the letter-bitmap bits for letters above r.
  if (r < 'Z') {
    if (r < 'A')
      upper_ = 0;
    else
      upper_ &= AlphaMask >> ('Z' - r);
  }

  if (r < 'z') {
    if (r < 'a')
      lower_ = 0;
    else
      lower_ &= AlphaMask >> ('z' - r);
  }

  // Repeatedly pull out a range that reaches above r.  A range that
  // straddles r is put back truncated to end at r; once truncated it no
  // longer matches the lookup, so the loop terminates.
  iterator it;
  while ((it = ranges_.find(RuneRange(r + 1, Runemax))) != end()) {
    RuneRange rr = *it;
    ranges_.erase(it);
    nrunes_ -= rr.hi - rr.lo + 1;
    if (rr.lo <= r) {
      rr.hi = r;
      ranges_.insert(rr);
      nrunes_ += rr.hi - rr.lo + 1;
    }
  }
}
|
||||
|
||||
void CharClassBuilder::Negate() {
|
||||
// Build up negation and then copy in.
|
||||
// Could edit ranges in place, but C++ won't let me.
|
||||
vector<RuneRange> v;
|
||||
v.reserve(ranges_.size() + 1);
|
||||
|
||||
// In negation, first range begins at 0, unless
|
||||
// the current class begins at 0.
|
||||
iterator it = begin();
|
||||
if (it == end()) {
|
||||
v.push_back(RuneRange(0, Runemax));
|
||||
} else {
|
||||
int nextlo = 0;
|
||||
if (it->lo == 0) {
|
||||
nextlo = it->hi + 1;
|
||||
++it;
|
||||
}
|
||||
for (; it != end(); ++it) {
|
||||
v.push_back(RuneRange(nextlo, it->lo - 1));
|
||||
nextlo = it->hi + 1;
|
||||
}
|
||||
if (nextlo <= Runemax)
|
||||
v.push_back(RuneRange(nextlo, Runemax));
|
||||
}
|
||||
|
||||
ranges_.clear();
|
||||
for (size_t i = 0; i < v.size(); i++)
|
||||
ranges_.insert(v[i]);
|
||||
|
||||
upper_ = AlphaMask & ~upper_;
|
||||
lower_ = AlphaMask & ~lower_;
|
||||
nrunes_ = Runemax+1 - nrunes_;
|
||||
}
|
||||
|
||||
// Character class is a sorted list of ranges.
|
||||
// The ranges are allocated in the same block as the header,
|
||||
// necessitating a special allocator and Delete method.
|
||||
|
||||
// Allocates a CharClass with room for maxranges ranges.  Header and range
// array share one uint8 buffer, with the ranges placed right after the
// header; the object must therefore be released with Delete().
CharClass* CharClass::New(int maxranges) {
  CharClass* cc;
  uint8* block = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
  cc = reinterpret_cast<CharClass*>(block);
  cc->ranges_ = reinterpret_cast<RuneRange*>(block + sizeof *cc);
  cc->folds_ascii_ = false;
  cc->nrunes_ = 0;
  cc->nranges_ = 0;
  return cc;
}
|
||||
|
||||
void CharClass::Delete() {
|
||||
uint8 *data = reinterpret_cast<uint8*>(this);
|
||||
delete[] data;
|
||||
}
|
||||
|
||||
// Returns a newly allocated CharClass holding the complement of this one
// over [0, Runemax].  folds_ascii_ is carried over unchanged.
CharClass* CharClass::Negate() {
  // The complement has at most one more range than the original.
  CharClass* neg = CharClass::New(nranges_+1);
  neg->folds_ascii_ = folds_ascii_;
  neg->nrunes_ = Runemax + 1 - nrunes_;

  int count = 0;
  int nextlo = 0;
  for (CharClass::iterator it = begin(); it != end(); ++it) {
    // Emit the gap before this range, unless the range starts exactly
    // where the previous one (or the rune space) left off.
    if (it->lo != nextlo)
      neg->ranges_[count++] = RuneRange(nextlo, it->lo - 1);
    nextlo = it->hi + 1;
  }
  if (nextlo <= Runemax)
    neg->ranges_[count++] = RuneRange(nextlo, Runemax);
  neg->nranges_ = count;
  return neg;
}
|
||||
|
||||
// Reports whether rune r is in the class, by binary search over the
// ranges array (which is ordered by rune value).
bool CharClass::Contains(Rune r) {
  RuneRange* base = ranges_;
  for (int remaining = nranges_; remaining > 0; ) {
    const int mid = remaining/2;
    const RuneRange& probe = base[mid];
    if (probe.hi < r) {
      // r is above this range: continue in the upper part.
      base += mid+1;
      remaining -= mid+1;
    } else if (r < probe.lo) {
      // r is below this range: continue in the lower part.
      remaining = mid;
    } else {
      // probe.lo <= r && r <= probe.hi
      return true;
    }
  }
  return false;
}
|
||||
|
||||
// Returns a newly allocated CharClass snapshot of this builder: the same
// ranges in iteration order, plus the rune count and the ASCII-folding
// flag.
CharClass* CharClassBuilder::GetCharClass() {
  const int capacity = static_cast<int>(ranges_.size());
  CharClass* cc = CharClass::New(capacity);
  int count = 0;
  for (iterator it = begin(); it != end(); ++it) {
    cc->ranges_[count] = *it;
    count++;
  }
  cc->nranges_ = count;
  DCHECK_LE(count, static_cast<int>(ranges_.size()));
  cc->nrunes_ = nrunes_;
  cc->folds_ascii_ = FoldsASCII();
  return cc;
}
|
||||
|
||||
} // namespace re2
|
635
third_party/re2/re2/regexp.h
vendored
635
third_party/re2/re2/regexp.h
vendored
@ -1,635 +0,0 @@
|
||||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// --- SPONSORED LINK --------------------------------------------------
|
||||
// If you want to use this library for regular expression matching,
|
||||
// you should use re2/re2.h, which provides a class RE2 that
|
||||
// mimics the PCRE interface provided by PCRE's C++ wrappers.
|
||||
// This header describes the low-level interface used to implement RE2
|
||||
// and may change in backwards-incompatible ways from time to time.
|
||||
// In contrast, RE2's interface will not.
|
||||
// ---------------------------------------------------------------------
|
||||
|
||||
// Regular expression library: parsing, execution, and manipulation
|
||||
// of regular expressions.
|
||||
//
|
||||
// Any operation that traverses the Regexp structures should be written
|
||||
// using Regexp::Walker (see walker-inl.h), not recursively, because deeply nested
|
||||
// regular expressions such as x++++++++++++++++++++... might cause recursive
|
||||
// traversals to overflow the stack.
|
||||
//
|
||||
// It is the caller's responsibility to provide appropriate mutual exclusion
|
||||
// around manipulation of the regexps. RE2 does this.
|
||||
//
|
||||
// PARSING
|
||||
//
|
||||
// Regexp::Parse parses regular expressions encoded in UTF-8.
|
||||
// The default syntax is POSIX extended regular expressions,
|
||||
// with the following changes:
|
||||
//
|
||||
// 1. Backreferences (optional in POSIX EREs) are not supported.
|
||||
// (Supporting them precludes the use of DFA-based
|
||||
// matching engines.)
|
||||
//
|
||||
// 2. Collating elements and collation classes are not supported.
|
||||
// (No one has needed or wanted them.)
|
||||
//
|
||||
// The exact syntax accepted can be modified by passing flags to
|
||||
// Regexp::Parse. In particular, many of the basic Perl additions
|
||||
// are available. The flags are documented below (search for LikePerl).
|
||||
//
|
||||
// If parsed with the flag Regexp::Latin1, both the regular expression
|
||||
// and the input to the matching routines are assumed to be encoded in
|
||||
// Latin-1, not UTF-8.
|
||||
//
|
||||
// EXECUTION
|
||||
//
|
||||
// Once Regexp has parsed a regular expression, it provides methods
|
||||
// to search text using that regular expression. These methods are
|
||||
// implemented via calling out to other regular expression libraries.
|
||||
// (Let's call them the sublibraries.)
|
||||
//
|
||||
// To call a sublibrary, Regexp does not simply prepare a
|
||||
// string version of the regular expression and hand it to the
|
||||
// sublibrary. Instead, Regexp prepares, from its own parsed form, the
|
||||
// corresponding internal representation used by the sublibrary.
|
||||
// This has the drawback of needing to know the internal representation
|
||||
// used by the sublibrary, but it has two important benefits:
|
||||
//
|
||||
// 1. The syntax and meaning of regular expressions are guaranteed
|
||||
// to be that used by Regexp's parser, not the syntax expected
|
||||
// by the sublibrary. Regexp might accept a restricted or
|
||||
// expanded syntax for regular expressions as compared with
|
||||
// the sublibrary. As long as Regexp can translate from its
|
||||
// internal form into the sublibrary's, clients need not know
|
||||
// exactly which sublibrary they are using.
|
||||
//
|
||||
// 2. The sublibrary parsers are bypassed. For whatever reason,
|
||||
// sublibrary regular expression parsers often have security
|
||||
// problems. For example, plan9grep's regular expression parser
|
||||
// has a buffer overflow in its handling of large character
|
||||
// classes, and PCRE's parser has had buffer overflow problems
|
||||
// in the past. Security-team requires sandboxing of sublibrary
|
||||
// regular expression parsers. Avoiding the sublibrary parsers
|
||||
// avoids the sandbox.
|
||||
//
|
||||
// The execution methods we use now are provided by the compiled form,
|
||||
// Prog, described in prog.h
|
||||
//
|
||||
// MANIPULATION
|
||||
//
|
||||
// Unlike other regular expression libraries, Regexp makes its parsed
|
||||
// form accessible to clients, so that client code can analyze the
|
||||
// parsed regular expressions.
|
||||
|
||||
#ifndef RE2_REGEXP_H__
|
||||
#define RE2_REGEXP_H__
|
||||
|
||||
#include "util/util.h"
|
||||
#include "re2/stringpiece.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Node kinds of a parsed regular expression.
// Keep in sync with string list kOpcodeNames[] in testing/dump.cc
enum RegexpOp {
  kRegexpNoMatch = 1,     // Matches no strings.
  kRegexpEmptyMatch,      // Matches empty string.
  kRegexpLiteral,         // Matches rune_.
  kRegexpLiteralString,   // Matches runes_.

  kRegexpConcat,          // Matches concatenation of sub_[0..nsub-1].
  kRegexpAlternate,       // Matches union of sub_[0..nsub-1].

  kRegexpStar,            // Matches sub_[0] zero or more times.
  kRegexpPlus,            // Matches sub_[0] one or more times.
  kRegexpQuest,           // Matches sub_[0] zero or one times.

  // Matches sub_[0] at least min_ times, at most max_ times.
  // max_ == -1 means no upper limit.
  kRegexpRepeat,

  // Parenthesized (capturing) subexpression.  Index is cap_.
  // Optionally, capturing name is name_.
  kRegexpCapture,

  kRegexpAnyChar,         // Matches any character.
  kRegexpAnyByte,         // Matches any byte [sic].

  kRegexpBeginLine,       // Matches empty string at beginning of line.
  kRegexpEndLine,         // Matches empty string at end of line.

  kRegexpWordBoundary,    // Matches word boundary "\b".
  kRegexpNoWordBoundary,  // Matches not-a-word boundary "\B".

  kRegexpBeginText,       // Matches empty string at beginning of text.
  kRegexpEndText,         // Matches empty string at end of text.

  kRegexpCharClass,       // Matches character class given by cc_.

  // Forces match of entire expression right now,
  // with match ID match_id_ (used by RE2::Set).
  kRegexpHaveMatch,

  // Largest valid op value.
  kMaxRegexpOp = kRegexpHaveMatch,
};
|
||||
|
||||
// Status codes reported by parsing and related operations.
// The numeric values index the message table in regexp.cc.
// Keep in sync with string list in regexp.cc
enum RegexpStatusCode {
  kRegexpSuccess = 0,             // no error
  kRegexpInternalError = 1,       // unexpected error

  // Parse errors
  kRegexpBadEscape = 2,           // bad escape sequence
  kRegexpBadCharClass = 3,        // bad character class
  kRegexpBadCharRange = 4,        // bad character class range
  kRegexpMissingBracket = 5,      // missing closing ]
  kRegexpMissingParen = 6,        // missing closing )
  kRegexpTrailingBackslash = 7,   // at end of regexp
  kRegexpRepeatArgument = 8,      // repeat argument missing, e.g. "*"
  kRegexpRepeatSize = 9,          // bad repetition argument
  kRegexpRepeatOp = 10,           // bad repetition operator
  kRegexpBadPerlOp = 11,          // bad perl operator
  kRegexpBadUTF8 = 12,            // invalid UTF-8 in regexp
  kRegexpBadNamedCapture = 13,    // bad named capture
};
|
||||
|
||||
// Error status for certain operations.
|
||||
class RegexpStatus {
|
||||
public:
|
||||
RegexpStatus() : code_(kRegexpSuccess), tmp_(NULL) {}
|
||||
~RegexpStatus() { delete tmp_; }
|
||||
|
||||
void set_code(enum RegexpStatusCode code) { code_ = code; }
|
||||
void set_error_arg(const StringPiece& error_arg) { error_arg_ = error_arg; }
|
||||
void set_tmp(string* tmp) { delete tmp_; tmp_ = tmp; }
|
||||
enum RegexpStatusCode code() const { return code_; }
|
||||
const StringPiece& error_arg() const { return error_arg_; }
|
||||
bool ok() const { return code() == kRegexpSuccess; }
|
||||
|
||||
// Copies state from status.
|
||||
void Copy(const RegexpStatus& status);
|
||||
|
||||
// Returns text equivalent of code, e.g.:
|
||||
// "Bad character class"
|
||||
static string CodeText(enum RegexpStatusCode code);
|
||||
|
||||
// Returns text describing error, e.g.:
|
||||
// "Bad character class: [z-a]"
|
||||
string Text() const;
|
||||
|
||||
private:
|
||||
enum RegexpStatusCode code_; // Kind of error
|
||||
StringPiece error_arg_; // Piece of regexp containing syntax error.
|
||||
string* tmp_; // Temporary storage, possibly where error_arg_ is.
|
||||
|
||||
DISALLOW_COPY_AND_ASSIGN(RegexpStatus);
|
||||
};
|
||||
|
||||
// Walkers to implement Simplify.
|
||||
class CoalesceWalker;
|
||||
class SimplifyWalker;
|
||||
|
||||
// Compiled form; see prog.h
|
||||
class Prog;
|
||||
|
||||
struct RuneRange {
|
||||
RuneRange() : lo(0), hi(0) { }
|
||||
RuneRange(int l, int h) : lo(l), hi(h) { }
|
||||
Rune lo;
|
||||
Rune hi;
|
||||
};
|
||||
|
||||
// Less-than on RuneRanges treats a == b if they overlap at all.
|
||||
// This lets us look in a set to find the range covering a particular Rune.
|
||||
struct RuneRangeLess {
|
||||
bool operator()(const RuneRange& a, const RuneRange& b) const {
|
||||
return a.hi < b.lo;
|
||||
}
|
||||
};
|
||||
|
||||
class CharClassBuilder;
|
||||
|
||||
class CharClass {
|
||||
public:
|
||||
void Delete();
|
||||
|
||||
typedef RuneRange* iterator;
|
||||
iterator begin() { return ranges_; }
|
||||
iterator end() { return ranges_ + nranges_; }
|
||||
|
||||
int size() { return nrunes_; }
|
||||
bool empty() { return nrunes_ == 0; }
|
||||
bool full() { return nrunes_ == Runemax+1; }
|
||||
bool FoldsASCII() { return folds_ascii_; }
|
||||
|
||||
bool Contains(Rune r);
|
||||
CharClass* Negate();
|
||||
|
||||
private:
|
||||
CharClass(); // not implemented
|
||||
~CharClass(); // not implemented
|
||||
static CharClass* New(int maxranges);
|
||||
|
||||
friend class CharClassBuilder;
|
||||
|
||||
bool folds_ascii_;
|
||||
int nrunes_;
|
||||
RuneRange *ranges_;
|
||||
int nranges_;
|
||||
DISALLOW_COPY_AND_ASSIGN(CharClass);
|
||||
};
|
||||
|
||||
class Regexp {
|
||||
public:
|
||||
|
||||
// Flags for parsing. Can be ORed together.
|
||||
enum ParseFlags {
|
||||
NoParseFlags = 0,
|
||||
FoldCase = 1<<0, // Fold case during matching (case-insensitive).
|
||||
Literal = 1<<1, // Treat s as literal string instead of a regexp.
|
||||
ClassNL = 1<<2, // Allow char classes like [^a-z] and \D and \s
|
||||
// and [[:space:]] to match newline.
|
||||
DotNL = 1<<3, // Allow . to match newline.
|
||||
MatchNL = ClassNL | DotNL,
|
||||
OneLine = 1<<4, // Treat ^ and $ as only matching at beginning and
|
||||
// end of text, not around embedded newlines.
|
||||
// (Perl's default)
|
||||
Latin1 = 1<<5, // Regexp and text are in Latin1, not UTF-8.
|
||||
NonGreedy = 1<<6, // Repetition operators are non-greedy by default.
|
||||
PerlClasses = 1<<7, // Allow Perl character classes like \d.
|
||||
PerlB = 1<<8, // Allow Perl's \b and \B.
|
||||
PerlX = 1<<9, // Perl extensions:
|
||||
// non-capturing parens - (?: )
|
||||
// non-greedy operators - *? +? ?? {}?
|
||||
// flag edits - (?i) (?-i) (?i: )
|
||||
// i - FoldCase
|
||||
// m - !OneLine
|
||||
// s - DotNL
|
||||
// U - NonGreedy
|
||||
// line ends: \A \z
|
||||
// \Q and \E to disable/enable metacharacters
|
||||
// (?P<name>expr) for named captures
|
||||
// \C to match any single byte
|
||||
UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
|
||||
// and \P{Han} for its negation.
|
||||
NeverNL = 1<<11, // Never match NL, even if the regexp mentions
|
||||
// it explicitly.
|
||||
NeverCapture = 1<<12, // Parse all parens as non-capturing.
|
||||
|
||||
// As close to Perl as we can get.
|
||||
LikePerl = ClassNL | OneLine | PerlClasses | PerlB | PerlX |
|
||||
UnicodeGroups,
|
||||
|
||||
// Internal use only.
|
||||
WasDollar = 1<<15, // on kRegexpEndText: was $ in regexp text
|
||||
};
|
||||
|
||||
// Get. No set, Regexps are logically immutable once created.
|
||||
RegexpOp op() { return static_cast<RegexpOp>(op_); }
|
||||
int nsub() { return nsub_; }
|
||||
bool simple() { return simple_ != 0; }
|
||||
enum ParseFlags parse_flags() { return static_cast<ParseFlags>(parse_flags_); }
|
||||
int Ref(); // For testing.
|
||||
|
||||
Regexp** sub() {
|
||||
if(nsub_ <= 1)
|
||||
return &subone_;
|
||||
else
|
||||
return submany_;
|
||||
}
|
||||
|
||||
int min() { DCHECK_EQ(op_, kRegexpRepeat); return min_; }
|
||||
int max() { DCHECK_EQ(op_, kRegexpRepeat); return max_; }
|
||||
Rune rune() { DCHECK_EQ(op_, kRegexpLiteral); return rune_; }
|
||||
CharClass* cc() { DCHECK_EQ(op_, kRegexpCharClass); return cc_; }
|
||||
int cap() { DCHECK_EQ(op_, kRegexpCapture); return cap_; }
|
||||
const string* name() { DCHECK_EQ(op_, kRegexpCapture); return name_; }
|
||||
Rune* runes() { DCHECK_EQ(op_, kRegexpLiteralString); return runes_; }
|
||||
int nrunes() { DCHECK_EQ(op_, kRegexpLiteralString); return nrunes_; }
|
||||
int match_id() { DCHECK_EQ(op_, kRegexpHaveMatch); return match_id_; }
|
||||
|
||||
// Increments reference count, returns object as convenience.
|
||||
Regexp* Incref();
|
||||
|
||||
// Decrements reference count and deletes this object if count reaches 0.
|
||||
void Decref();
|
||||
|
||||
// Parses string s to produce regular expression, returned.
|
||||
// Caller must release return value with re->Decref().
|
||||
// On failure, sets *status (if status != NULL) and returns NULL.
|
||||
static Regexp* Parse(const StringPiece& s, ParseFlags flags,
|
||||
RegexpStatus* status);
|
||||
|
||||
// Returns a _new_ simplified version of the current regexp.
|
||||
// Does not edit the current regexp.
|
||||
// Caller must release return value with re->Decref().
|
||||
// Simplified means that counted repetition has been rewritten
|
||||
// into simpler terms and all Perl/POSIX features have been
|
||||
// removed. The result will capture exactly the same
|
||||
// subexpressions the original did, unless formatted with ToString.
|
||||
Regexp* Simplify();
|
||||
friend class CoalesceWalker;
|
||||
friend class SimplifyWalker;
|
||||
|
||||
// Parses the regexp src and then simplifies it and sets *dst to the
|
||||
// string representation of the simplified form. Returns true on success.
|
||||
// Returns false and sets *status (if status != NULL) on parse error.
|
||||
static bool SimplifyRegexp(const StringPiece& src, ParseFlags flags,
|
||||
string* dst,
|
||||
RegexpStatus* status);
|
||||
|
||||
// Returns the number of capturing groups in the regexp.
|
||||
int NumCaptures();
|
||||
friend class NumCapturesWalker;
|
||||
|
||||
// Returns a map from names to capturing group indices,
|
||||
// or NULL if the regexp contains no named capture groups.
|
||||
// The caller is responsible for deleting the map.
|
||||
map<string, int>* NamedCaptures();
|
||||
|
||||
// Returns a map from capturing group indices to capturing group
|
||||
// names or NULL if the regexp contains no named capture groups. The
|
||||
// caller is responsible for deleting the map.
|
||||
map<int, string>* CaptureNames();
|
||||
|
||||
// Returns a string representation of the current regexp,
|
||||
// using as few parentheses as possible.
|
||||
string ToString();
|
||||
|
||||
// Convenience functions. They consume the passed reference,
|
||||
// so in many cases you should use, e.g., Plus(re->Incref(), flags).
|
||||
// They do not consume allocated arrays like subs or runes.
|
||||
static Regexp* Plus(Regexp* sub, ParseFlags flags);
|
||||
static Regexp* Star(Regexp* sub, ParseFlags flags);
|
||||
static Regexp* Quest(Regexp* sub, ParseFlags flags);
|
||||
static Regexp* Concat(Regexp** subs, int nsubs, ParseFlags flags);
|
||||
static Regexp* Alternate(Regexp** subs, int nsubs, ParseFlags flags);
|
||||
static Regexp* Capture(Regexp* sub, ParseFlags flags, int cap);
|
||||
static Regexp* Repeat(Regexp* sub, ParseFlags flags, int min, int max);
|
||||
static Regexp* NewLiteral(Rune rune, ParseFlags flags);
|
||||
static Regexp* NewCharClass(CharClass* cc, ParseFlags flags);
|
||||
static Regexp* LiteralString(Rune* runes, int nrunes, ParseFlags flags);
|
||||
static Regexp* HaveMatch(int match_id, ParseFlags flags);
|
||||
|
||||
// Like Alternate but does not factor out common prefixes.
|
||||
static Regexp* AlternateNoFactor(Regexp** subs, int nsubs, ParseFlags flags);
|
||||
|
||||
// Debugging function. Returns string format for regexp
|
||||
// that makes structure clear. Does NOT use regexp syntax.
|
||||
string Dump();
|
||||
|
||||
// Helper traversal class, defined fully in walker-inl.h.
|
||||
template<typename T> class Walker;
|
||||
|
||||
// Compile to Prog. See prog.h
|
||||
// Reverse prog expects to be run over text backward.
|
||||
// Construction and execution of prog will
|
||||
// stay within approximately max_mem bytes of memory.
|
||||
// If max_mem <= 0, a reasonable default is used.
|
||||
Prog* CompileToProg(int64 max_mem);
|
||||
Prog* CompileToReverseProg(int64 max_mem);
|
||||
|
||||
// Whether to expect this library to find exactly the same answer as PCRE
|
||||
// when running this regexp. Most regexps do mimic PCRE exactly, but a few
|
||||
// obscure cases behave differently. Technically this is more a property
|
||||
// of the Prog than the Regexp, but the computation is much easier to do
|
||||
// on the Regexp. See mimics_pcre.cc for the exact conditions.
|
||||
bool MimicsPCRE();
|
||||
|
||||
// Benchmarking function.
|
||||
void NullWalk();
|
||||
|
||||
// Whether every match of this regexp must be anchored and
|
||||
// begin with a non-empty fixed string (perhaps after ASCII
|
||||
// case-folding). If so, returns the prefix and the sub-regexp that
|
||||
// follows it.
|
||||
bool RequiredPrefix(string* prefix, bool *foldcase, Regexp** suffix);
|
||||
|
||||
private:
|
||||
// Constructor allocates vectors as appropriate for operator.
|
||||
explicit Regexp(RegexpOp op, ParseFlags parse_flags);
|
||||
|
||||
// Use Decref() instead of delete to release Regexps.
|
||||
// This is private to catch deletes at compile time.
|
||||
~Regexp();
|
||||
void Destroy();
|
||||
bool QuickDestroy();
|
||||
|
||||
// Helpers for Parse. Listed here so they can edit Regexps.
|
||||
class ParseState;
|
||||
friend class ParseState;
|
||||
friend bool ParseCharClass(StringPiece* s, Regexp** out_re,
|
||||
RegexpStatus* status);
|
||||
|
||||
// Helper for testing [sic].
|
||||
friend bool RegexpEqualTestingOnly(Regexp*, Regexp*);
|
||||
|
||||
// Computes whether Regexp is already simple.
|
||||
bool ComputeSimple();
|
||||
|
||||
// Constructor that generates a concatenation or alternation,
|
||||
// enforcing the limit on the number of subexpressions for
|
||||
// a particular Regexp.
|
||||
static Regexp* ConcatOrAlternate(RegexpOp op, Regexp** subs, int nsubs,
|
||||
ParseFlags flags, bool can_factor);
|
||||
|
||||
// Returns the leading string that re starts with.
|
||||
// The returned Rune* points into a piece of re,
|
||||
// so it must not be used after the caller calls re->Decref().
|
||||
static Rune* LeadingString(Regexp* re, int* nrune, ParseFlags* flags);
|
||||
|
||||
// Removes the first n leading runes from the beginning of re.
|
||||
// Edits re in place.
|
||||
static void RemoveLeadingString(Regexp* re, int n);
|
||||
|
||||
// Returns the leading regexp in re's top-level concatenation.
|
||||
// The returned Regexp* points at re or a sub-expression of re,
|
||||
// so it must not be used after the caller calls re->Decref().
|
||||
static Regexp* LeadingRegexp(Regexp* re);
|
||||
|
||||
// Removes LeadingRegexp(re) from re and returns the remainder.
|
||||
// Might edit re in place.
|
||||
static Regexp* RemoveLeadingRegexp(Regexp* re);
|
||||
|
||||
// Simplifies an alternation of literal strings by factoring out
|
||||
// common prefixes.
|
||||
static int FactorAlternation(Regexp** sub, int nsub, ParseFlags flags);
|
||||
static int FactorAlternationRecursive(Regexp** sub, int nsub,
|
||||
ParseFlags flags, int maxdepth);
|
||||
|
||||
// Is a == b? Only efficient on regexps that have not been through
|
||||
// Simplify yet - the expansion of a kRegexpRepeat will make this
|
||||
// take a long time. Do not call on such regexps, hence private.
|
||||
static bool Equal(Regexp* a, Regexp* b);
|
||||
|
||||
// Allocate space for n sub-regexps.
|
||||
void AllocSub(int n) {
|
||||
if (n < 0 || static_cast<uint16>(n) != n)
|
||||
LOG(FATAL) << "Cannot AllocSub " << n;
|
||||
if (n > 1)
|
||||
submany_ = new Regexp*[n];
|
||||
nsub_ = n;
|
||||
}
|
||||
|
||||
// Add Rune to LiteralString
|
||||
void AddRuneToString(Rune r);
|
||||
|
||||
// Swaps this with that, in place.
|
||||
void Swap(Regexp *that);
|
||||
|
||||
// Operator. See description of operators above.
|
||||
// uint8 instead of RegexpOp to control space usage.
|
||||
uint8 op_;
|
||||
|
||||
// Is this regexp structure already simple
|
||||
// (has it been returned by Simplify)?
|
||||
// uint8 instead of bool to control space usage.
|
||||
uint8 simple_;
|
||||
|
||||
// Flags saved from parsing and used during execution.
|
||||
// (Only FoldCase is used.)
|
||||
// uint16 instead of ParseFlags to control space usage.
|
||||
uint16 parse_flags_;
|
||||
|
||||
// Reference count. Exists so that SimplifyRegexp can build
|
||||
// regexp structures that are dags rather than trees to avoid
|
||||
// exponential blowup in space requirements.
|
||||
// uint16 to control space usage.
|
||||
// The standard regexp routines will never generate a
|
||||
// ref greater than the maximum repeat count (100),
|
||||
// but even so, Incref and Decref consult an overflow map
|
||||
// when ref_ reaches kMaxRef.
|
||||
uint16 ref_;
|
||||
static const uint16 kMaxRef = 0xffff;
|
||||
|
||||
// Subexpressions.
|
||||
// uint16 to control space usage.
|
||||
// Concat and Alternate handle larger numbers of subexpressions
|
||||
// by building concatenation or alternation trees.
|
||||
// Other routines should call Concat or Alternate instead of
|
||||
// filling in sub() by hand.
|
||||
uint16 nsub_;
|
||||
static const uint16 kMaxNsub = 0xffff;
|
||||
union {
|
||||
Regexp** submany_; // if nsub_ > 1
|
||||
Regexp* subone_; // if nsub_ == 1
|
||||
};
|
||||
|
||||
// Extra space for parse and teardown stacks.
|
||||
Regexp* down_;
|
||||
|
||||
// Arguments to operator. See description of operators above.
|
||||
union {
|
||||
struct { // Repeat
|
||||
int max_;
|
||||
int min_;
|
||||
};
|
||||
struct { // Capture
|
||||
int cap_;
|
||||
string* name_;
|
||||
};
|
||||
struct { // LiteralString
|
||||
int nrunes_;
|
||||
Rune* runes_;
|
||||
};
|
||||
struct { // CharClass
|
||||
// These two could be in separate union members,
|
||||
// but it wouldn't save any space (there are other two-word structs)
|
||||
// and keeping them separate avoids confusion during parsing.
|
||||
CharClass* cc_;
|
||||
CharClassBuilder* ccb_;
|
||||
};
|
||||
Rune rune_; // Literal
|
||||
int match_id_; // HaveMatch
|
||||
void *the_union_[2]; // as big as any other element, for memset
|
||||
};
|
||||
|
||||
DISALLOW_COPY_AND_ASSIGN(Regexp);
|
||||
};
|
||||
|
||||
// Character class set: contains non-overlapping, non-abutting RuneRanges.
|
||||
typedef set<RuneRange, RuneRangeLess> RuneRangeSet;
|
||||
|
||||
class CharClassBuilder {
|
||||
public:
|
||||
CharClassBuilder();
|
||||
|
||||
typedef RuneRangeSet::iterator iterator;
|
||||
iterator begin() { return ranges_.begin(); }
|
||||
iterator end() { return ranges_.end(); }
|
||||
|
||||
int size() { return nrunes_; }
|
||||
bool empty() { return nrunes_ == 0; }
|
||||
bool full() { return nrunes_ == Runemax+1; }
|
||||
|
||||
bool Contains(Rune r);
|
||||
bool FoldsASCII();
|
||||
bool AddRange(Rune lo, Rune hi); // returns whether class changed
|
||||
CharClassBuilder* Copy();
|
||||
void AddCharClass(CharClassBuilder* cc);
|
||||
void Negate();
|
||||
void RemoveAbove(Rune r);
|
||||
CharClass* GetCharClass();
|
||||
void AddRangeFlags(Rune lo, Rune hi, Regexp::ParseFlags parse_flags);
|
||||
|
||||
private:
|
||||
static const uint32 AlphaMask = (1<<26) - 1;
|
||||
uint32 upper_; // bitmap of A-Z
|
||||
uint32 lower_; // bitmap of a-z
|
||||
int nrunes_;
|
||||
RuneRangeSet ranges_;
|
||||
DISALLOW_COPY_AND_ASSIGN(CharClassBuilder);
|
||||
};
|
||||
|
||||
// Tell g++ that bitwise ops on ParseFlags produce ParseFlags.
|
||||
inline Regexp::ParseFlags operator|(Regexp::ParseFlags a, Regexp::ParseFlags b)
|
||||
{
|
||||
return static_cast<Regexp::ParseFlags>(static_cast<int>(a) | static_cast<int>(b));
|
||||
}
|
||||
|
||||
// Bitwise XOR of two flag sets, computed on the underlying ints.
inline Regexp::ParseFlags operator^(Regexp::ParseFlags a, Regexp::ParseFlags b)
{
  const int bits = static_cast<int>(a) ^ static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(bits);
}
|
||||
|
||||
// Bitwise AND of two flag sets, computed on the underlying ints.
inline Regexp::ParseFlags operator&(Regexp::ParseFlags a, Regexp::ParseFlags b)
{
  const int bits = static_cast<int>(a) & static_cast<int>(b);
  return static_cast<Regexp::ParseFlags>(bits);
}
|
||||
|
||||
// Bitwise complement of a flag set, computed on the underlying int.
inline Regexp::ParseFlags operator~(Regexp::ParseFlags a)
{
  const int bits = ~static_cast<int>(a);
  return static_cast<Regexp::ParseFlags>(bits);
}
|
||||
|
||||
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_REGEXP_H__
|
113
third_party/re2/re2/set.cc
vendored
113
third_party/re2/re2/set.cc
vendored
@ -1,113 +0,0 @@
|
||||
// Copyright 2010 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "re2/set.h"
|
||||
|
||||
#include "util/util.h"
|
||||
#include "re2/stringpiece.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
using namespace re2;
|
||||
|
||||
// Creates an empty, not-yet-compiled set that remembers a copy of the
// given options and the anchor.
RE2::Set::Set(const RE2::Options& options, RE2::Anchor anchor)
    : anchor_(anchor),
      prog_(NULL),
      compiled_(false) {
  // Options are transferred with Copy() rather than by construction.
  options_.Copy(options);
}
|
||||
|
||||
// Drops the reference held on every regexp still stored in the set and
// deletes the compiled program.
RE2::Set::~Set() {
  const size_t count = re_.size();
  for (size_t i = 0; i != count; ++i) {
    re_[i]->Decref();
  }
  delete prog_;
}
|
||||
|
||||
int RE2::Set::Add(const StringPiece& pattern, string* error) {
|
||||
if (compiled_) {
|
||||
LOG(DFATAL) << "RE2::Set::Add after Compile";
|
||||
return -1;
|
||||
}
|
||||
|
||||
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
|
||||
options_.ParseFlags());
|
||||
|
||||
RegexpStatus status;
|
||||
re2::Regexp* re = Regexp::Parse(pattern, pf, &status);
|
||||
if (re == NULL) {
|
||||
if (error != NULL)
|
||||
*error = status.Text();
|
||||
if (options_.log_errors())
|
||||
LOG(ERROR) << "Error parsing '" << pattern << "': " << status.Text();
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Concatenate with match index and push on vector.
|
||||
int n = static_cast<int>(re_.size());
|
||||
re2::Regexp* m = re2::Regexp::HaveMatch(n, pf);
|
||||
if (re->op() == kRegexpConcat) {
|
||||
int nsub = re->nsub();
|
||||
re2::Regexp** sub = new re2::Regexp*[nsub + 1];
|
||||
for (int i = 0; i < nsub; i++)
|
||||
sub[i] = re->sub()[i]->Incref();
|
||||
sub[nsub] = m;
|
||||
re->Decref();
|
||||
re = re2::Regexp::Concat(sub, nsub + 1, pf);
|
||||
delete[] sub;
|
||||
} else {
|
||||
re2::Regexp* sub[2];
|
||||
sub[0] = re;
|
||||
sub[1] = m;
|
||||
re = re2::Regexp::Concat(sub, 2, pf);
|
||||
}
|
||||
re_.push_back(re);
|
||||
return n;
|
||||
}
|
||||
|
||||
bool RE2::Set::Compile() {
|
||||
if (compiled_) {
|
||||
LOG(DFATAL) << "RE2::Set::Compile multiple times";
|
||||
return false;
|
||||
}
|
||||
compiled_ = true;
|
||||
|
||||
Regexp::ParseFlags pf = static_cast<Regexp::ParseFlags>(
|
||||
options_.ParseFlags());
|
||||
re2::Regexp* re = re2::Regexp::Alternate(const_cast<re2::Regexp**>(re_.data()),
|
||||
static_cast<int>(re_.size()), pf);
|
||||
re_.clear();
|
||||
re2::Regexp* sre = re->Simplify();
|
||||
re->Decref();
|
||||
re = sre;
|
||||
if (re == NULL) {
|
||||
if (options_.log_errors())
|
||||
LOG(ERROR) << "Error simplifying during Compile.";
|
||||
return false;
|
||||
}
|
||||
|
||||
prog_ = Prog::CompileSet(options_, anchor_, re);
|
||||
return prog_ != NULL;
|
||||
}
|
||||
|
||||
bool RE2::Set::Match(const StringPiece& text, vector<int>* v) const {
|
||||
if (!compiled_) {
|
||||
LOG(DFATAL) << "RE2::Set::Match without Compile";
|
||||
return false;
|
||||
}
|
||||
v->clear();
|
||||
bool failed;
|
||||
bool ret = prog_->SearchDFA(text, text, Prog::kAnchored,
|
||||
Prog::kManyMatch, NULL, &failed, v);
|
||||
if (failed)
|
||||
LOG(DFATAL) << "RE2::Set::Match: DFA ran out of cache space";
|
||||
|
||||
if (ret == false)
|
||||
return false;
|
||||
if (v->size() == 0) {
|
||||
LOG(DFATAL) << "RE2::Set::Match: match but unknown regexp set";
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
55
third_party/re2/re2/set.h
vendored
55
third_party/re2/re2/set.h
vendored
@ -1,55 +0,0 @@
|
||||
// Copyright 2010 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_SET_H
|
||||
#define RE2_SET_H
|
||||
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "re2/re2.h"
|
||||
|
||||
namespace re2 {
|
||||
using std::vector;
|
||||
|
||||
// An RE2::Set represents a collection of regexps that can
|
||||
// be searched for simultaneously.
|
||||
class RE2::Set {
|
||||
public:
|
||||
Set(const RE2::Options& options, RE2::Anchor anchor);
|
||||
~Set();
|
||||
|
||||
// Add adds regexp pattern to the set, interpreted using the RE2 options.
|
||||
// (The RE2 constructor's default options parameter is RE2::UTF8.)
|
||||
// Add returns the regexp index that will be used to identify
|
||||
// it in the result of Match, or -1 if the regexp cannot be parsed.
|
||||
// Indices are assigned in sequential order starting from 0.
|
||||
// Error returns do not increment the index.
|
||||
// If an error occurs and error != NULL, *error will hold an error message.
|
||||
int Add(const StringPiece& pattern, string* error);
|
||||
|
||||
// Compile prepares the Set for matching.
|
||||
// Add must not be called again after Compile.
|
||||
// Compile must be called before FullMatch or PartialMatch.
|
||||
// Compile may return false if it runs out of memory.
|
||||
bool Compile();
|
||||
|
||||
// Match returns true if text matches any of the regexps in the set.
|
||||
// If so, it fills v with the indices of the matching regexps.
|
||||
bool Match(const StringPiece& text, vector<int>* v) const;
|
||||
|
||||
private:
|
||||
RE2::Options options_;
|
||||
RE2::Anchor anchor_;
|
||||
vector<re2::Regexp*> re_;
|
||||
re2::Prog* prog_;
|
||||
bool compiled_;
|
||||
//DISALLOW_COPY_AND_ASSIGN(Set);
|
||||
Set(const Set&);
|
||||
void operator=(const Set&);
|
||||
};
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_SET_H
|
652
third_party/re2/re2/simplify.cc
vendored
652
third_party/re2/re2/simplify.cc
vendored
@ -1,652 +0,0 @@
|
||||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Rewrite POSIX and other features in re
|
||||
// to use simple extended regular expression features.
|
||||
// Also sort and simplify character classes.
|
||||
|
||||
#include "util/util.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/walker-inl.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Parses the regexp src and then simplifies it and sets *dst to the
|
||||
// string representation of the simplified form. Returns true on success.
|
||||
// Returns false and sets *error (if error != NULL) on error.
|
||||
bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
|
||||
string* dst,
|
||||
RegexpStatus* status) {
|
||||
Regexp* re = Parse(src, flags, status);
|
||||
if (re == NULL)
|
||||
return false;
|
||||
Regexp* sre = re->Simplify();
|
||||
re->Decref();
|
||||
if (sre == NULL) {
|
||||
// Should not happen, since Simplify never fails.
|
||||
LOG(ERROR) << "Simplify failed on " << src;
|
||||
if (status) {
|
||||
status->set_code(kRegexpInternalError);
|
||||
status->set_error_arg(src);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
*dst = sre->ToString();
|
||||
sre->Decref();
|
||||
return true;
|
||||
}
|
||||
|
||||
// Assuming the simple_ flags on the children are accurate,
|
||||
// is this Regexp* simple?
|
||||
bool Regexp::ComputeSimple() {
|
||||
Regexp** subs;
|
||||
switch (op_) {
|
||||
case kRegexpNoMatch:
|
||||
case kRegexpEmptyMatch:
|
||||
case kRegexpLiteral:
|
||||
case kRegexpLiteralString:
|
||||
case kRegexpBeginLine:
|
||||
case kRegexpEndLine:
|
||||
case kRegexpBeginText:
|
||||
case kRegexpWordBoundary:
|
||||
case kRegexpNoWordBoundary:
|
||||
case kRegexpEndText:
|
||||
case kRegexpAnyChar:
|
||||
case kRegexpAnyByte:
|
||||
case kRegexpHaveMatch:
|
||||
return true;
|
||||
case kRegexpConcat:
|
||||
case kRegexpAlternate:
|
||||
// These are simple as long as the subpieces are simple.
|
||||
subs = sub();
|
||||
for (int i = 0; i < nsub_; i++)
|
||||
if (!subs[i]->simple())
|
||||
return false;
|
||||
return true;
|
||||
case kRegexpCharClass:
|
||||
// Simple as long as the char class is not empty, not full.
|
||||
if (ccb_ != NULL)
|
||||
return !ccb_->empty() && !ccb_->full();
|
||||
return !cc_->empty() && !cc_->full();
|
||||
case kRegexpCapture:
|
||||
subs = sub();
|
||||
return subs[0]->simple();
|
||||
case kRegexpStar:
|
||||
case kRegexpPlus:
|
||||
case kRegexpQuest:
|
||||
subs = sub();
|
||||
if (!subs[0]->simple())
|
||||
return false;
|
||||
switch (subs[0]->op_) {
|
||||
case kRegexpStar:
|
||||
case kRegexpPlus:
|
||||
case kRegexpQuest:
|
||||
case kRegexpEmptyMatch:
|
||||
case kRegexpNoMatch:
|
||||
return false;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return true;
|
||||
case kRegexpRepeat:
|
||||
return false;
|
||||
}
|
||||
LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Walker subclass used by Simplify.
|
||||
// Coalesces runs of star/plus/quest/repeat of the same literal along with any
|
||||
// occurrences of that literal into repeats of that literal. It also works for
|
||||
// char classes, any char and any byte.
|
||||
// PostVisit creates the coalesced result, which should then be simplified.
|
||||
class CoalesceWalker : public Regexp::Walker<Regexp*> {
|
||||
public:
|
||||
CoalesceWalker() {}
|
||||
virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
|
||||
Regexp** child_args, int nchild_args);
|
||||
virtual Regexp* Copy(Regexp* re);
|
||||
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
|
||||
|
||||
private:
|
||||
// These functions are declared inside CoalesceWalker so that
|
||||
// they can edit the private fields of the Regexps they construct.
|
||||
|
||||
// Returns true if r1 and r2 can be coalesced. In particular, ensures that
|
||||
// the parse flags are consistent. (They will not be checked again later.)
|
||||
static bool CanCoalesce(Regexp* r1, Regexp* r2);
|
||||
|
||||
// Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
|
||||
// will be empty match and the coalesced op. In other cases, where part of a
|
||||
// literal string was removed to be coalesced, the array elements afterwards
|
||||
// will be the coalesced op and the remainder of the literal string.
|
||||
static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
|
||||
|
||||
DISALLOW_COPY_AND_ASSIGN(CoalesceWalker);
|
||||
};
|
||||
|
||||
// Walker subclass used by Simplify.
|
||||
// The simplify walk is purely post-recursive: given the simplified children,
|
||||
// PostVisit creates the simplified result.
|
||||
// The child_args are simplified Regexp*s.
|
||||
class SimplifyWalker : public Regexp::Walker<Regexp*> {
|
||||
public:
|
||||
SimplifyWalker() {}
|
||||
virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
|
||||
virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
|
||||
Regexp** child_args, int nchild_args);
|
||||
virtual Regexp* Copy(Regexp* re);
|
||||
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
|
||||
|
||||
private:
|
||||
// These functions are declared inside SimplifyWalker so that
|
||||
// they can edit the private fields of the Regexps they construct.
|
||||
|
||||
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
|
||||
// Caller must Decref return value when done with it.
|
||||
static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
|
||||
|
||||
// Simplifies the expression re{min,max} in terms of *, +, and ?.
|
||||
// Returns a new regexp. Does not edit re. Does not consume reference to re.
|
||||
// Caller must Decref return value when done with it.
|
||||
static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
|
||||
Regexp::ParseFlags parse_flags);
|
||||
|
||||
// Simplifies a character class by expanding any named classes
|
||||
// into rune ranges. Does not edit re. Does not consume ref to re.
|
||||
// Caller must Decref return value when done with it.
|
||||
static Regexp* SimplifyCharClass(Regexp* re);
|
||||
|
||||
DISALLOW_COPY_AND_ASSIGN(SimplifyWalker);
|
||||
};
|
||||
|
||||
// Simplifies a regular expression, returning a new regexp.
|
||||
// The new regexp uses traditional Unix egrep features only,
|
||||
// plus the Perl (?:) non-capturing parentheses.
|
||||
// Otherwise, no POSIX or Perl additions. The new regexp
|
||||
// captures exactly the same subexpressions (with the same indices)
|
||||
// as the original.
|
||||
// Does not edit current object.
|
||||
// Caller must Decref() return value when done with it.
|
||||
|
||||
Regexp* Regexp::Simplify() {
|
||||
CoalesceWalker cw;
|
||||
Regexp* cre = cw.Walk(this, NULL);
|
||||
if (cre == NULL)
|
||||
return cre;
|
||||
SimplifyWalker sw;
|
||||
Regexp* sre = sw.Walk(cre, NULL);
|
||||
cre->Decref();
|
||||
return sre;
|
||||
}
|
||||
|
||||
#define Simplify DontCallSimplify // Avoid accidental recursion
|
||||
|
||||
// Utility function for PostVisit implementations that compares re->sub() with
|
||||
// child_args to determine whether any child_args changed. In the common case,
|
||||
// where nothing changed, calls Decref() for all child_args and returns false,
|
||||
// so PostVisit must return re->Incref(). Otherwise, returns true.
|
||||
static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
|
||||
for (int i = 0; i < re->nsub(); i++) {
|
||||
Regexp* sub = re->sub()[i];
|
||||
Regexp* newsub = child_args[i];
|
||||
if (newsub != sub)
|
||||
return true;
|
||||
}
|
||||
for (int i = 0; i < re->nsub(); i++) {
|
||||
Regexp* newsub = child_args[i];
|
||||
newsub->Decref();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// "Copies" re by taking another reference to it.
Regexp* CoalesceWalker::Copy(Regexp* re) {
  Regexp* shared = re->Incref();
  return shared;
}
|
||||
|
||||
// Not expected to run: Simplify uses Walk, not WalkExponential.
// If it is called anyway, it logs and returns another reference to re.
Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
  LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
  Regexp* shared = re->Incref();
  return shared;
}
|
||||
|
||||
// Builds the coalesced version of re from its already-visited children
// (child_args).
//  - A regexp without sub-expressions is returned as is (new reference).
//  - If re is not a concatenation, or is a concatenation in which no two
//    neighbouring children can be coalesced, re is returned as is when no
//    child changed; otherwise a new op of the same kind is built around
//    child_args.
//  - Otherwise neighbouring children are coalesced pairwise and a new
//    concatenation is built from the children that are not empty matches.
Regexp* CoalesceWalker::PostVisit(Regexp* re,
                                  Regexp* parent_arg,
                                  Regexp* pre_arg,
                                  Regexp** child_args,
                                  int nchild_args) {
  if (re->nsub() == 0)
    return re->Incref();

  // Only concatenations are candidates for coalescing.
  bool can_coalesce = false;
  if (re->op() == kRegexpConcat) {
    for (int i = 0; i + 1 < re->nsub(); i++) {
      if (CanCoalesce(child_args[i], child_args[i+1])) {
        can_coalesce = true;
        break;
      }
    }
  }

  if (!can_coalesce) {
    if (!ChildArgsChanged(re, child_args))
      return re->Incref();

    // Something changed.  Build a new op around the new children.
    Regexp* rebuilt = new Regexp(re->op(), re->parse_flags());
    rebuilt->AllocSub(re->nsub());
    Regexp** rebuilt_subs = rebuilt->sub();
    for (int i = 0; i < re->nsub(); i++)
      rebuilt_subs[i] = child_args[i];

    // Repeats and Captures have additional data that must be copied.
    switch (re->op()) {
      case kRegexpRepeat:
        rebuilt->min_ = re->min();
        rebuilt->max_ = re->max();
        break;
      case kRegexpCapture:
        rebuilt->cap_ = re->cap();
        break;
      default:
        break;
    }
    return rebuilt;
  }

  // Coalesce neighbours from left to right.  Each pair is re-checked
  // because an earlier DoCoalesce may have rewritten child_args[i].
  for (int i = 0; i + 1 < re->nsub(); i++) {
    if (CanCoalesce(child_args[i], child_args[i+1]))
      DoCoalesce(&child_args[i], &child_args[i+1]);
  }

  // Determine how many children are empty matches; those are dropped
  // from the new concatenation.
  int empties = 0;
  for (int i = 0; i < re->nsub(); i++) {
    if (child_args[i]->op() == kRegexpEmptyMatch)
      empties++;
  }

  // Build a new op from the remaining children, releasing the dropped
  // ones.
  Regexp* merged = new Regexp(re->op(), re->parse_flags());
  merged->AllocSub(re->nsub() - empties);
  Regexp** merged_subs = merged->sub();
  int out = 0;
  for (int i = 0; i < re->nsub(); i++) {
    Regexp* child = child_args[i];
    if (child->op() != kRegexpEmptyMatch) {
      merged_subs[out] = child;
      out++;
    } else {
      child->Decref();
    }
  }
  return merged;
}
|
||||
|
||||
// Decides whether the adjacent pieces r1 and r2 can be merged by DoCoalesce.
// r1 must be a star/plus/quest/repeat whose operand is a literal, char
// class, any char or any byte.  r2 must then be one of:
//   1. a star/plus/quest/repeat of an equal operand, with the same
//      NonGreedy flag as r1;
//   2. something equal to r1's operand itself;
//   3. (literal operand only) a literal string whose first rune is that
//      literal, with the same FoldCase flag as the operand.
bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
  const bool r1_repeats = r1->op() == kRegexpStar ||
                          r1->op() == kRegexpPlus ||
                          r1->op() == kRegexpQuest ||
                          r1->op() == kRegexpRepeat;
  if (!r1_repeats)
    return false;

  Regexp* atom = r1->sub()[0];
  const bool atom_ok = atom->op() == kRegexpLiteral ||
                       atom->op() == kRegexpCharClass ||
                       atom->op() == kRegexpAnyChar ||
                       atom->op() == kRegexpAnyByte;
  if (!atom_ok)
    return false;

  // Case 1: r2 repeats the same operand and agrees on greediness.
  const bool r2_repeats = r2->op() == kRegexpStar ||
                          r2->op() == kRegexpPlus ||
                          r2->op() == kRegexpQuest ||
                          r2->op() == kRegexpRepeat;
  if (r2_repeats &&
      Regexp::Equal(atom, r2->sub()[0]) &&
      ((r1->parse_flags() & Regexp::NonGreedy) ==
       (r2->parse_flags() & Regexp::NonGreedy))) {
    return true;
  }

  // Case 2: r2 is a single occurrence of the operand.
  if (Regexp::Equal(atom, r2))
    return true;

  // Case 3: r2 is a literal string that begins with the literal operand,
  // with a consistent case-folding flag.
  return atom->op() == kRegexpLiteral &&
         r2->op() == kRegexpLiteralString &&
         r2->runes()[0] == atom->rune() &&
         ((atom->parse_flags() & Regexp::FoldCase) ==
          (r2->parse_flags() & Regexp::FoldCase));
}
|
||||
|
||||
// Merges the adjacent pair (*r1ptr, *r2ptr) into a single Repeat of r1's
// operand whose bounds are the sum of both pieces (-1 means unbounded).
// On success the two slots are overwritten and the original r1 and r2 are
// Decref'ed:
//  - normally *r1ptr becomes an empty match and *r2ptr the merged repeat;
//  - when r2 is a literal string that is only partly absorbed, *r1ptr
//    becomes the merged repeat and *r2ptr the remaining literal string.
// If either op is unexpected, logs at DFATAL and leaves both slots alone.
void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
  Regexp* lhs = *r1ptr;
  Regexp* rhs = *r2ptr;

  // The merged node repeats lhs's operand; its bounds are filled in below.
  Regexp* merged = Regexp::Repeat(
      lhs->sub()[0]->Incref(), lhs->parse_flags(), 0, 0);

  // Bounds contributed by lhs.
  switch (lhs->op()) {
    case kRegexpStar:
      merged->min_ = 0;
      merged->max_ = -1;
      break;

    case kRegexpPlus:
      merged->min_ = 1;
      merged->max_ = -1;
      break;

    case kRegexpQuest:
      merged->min_ = 0;
      merged->max_ = 1;
      break;

    case kRegexpRepeat:
      merged->min_ = lhs->min();
      merged->max_ = lhs->max();
      break;

    default:
      LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << lhs->op();
      merged->Decref();
      return;
  }

  // Bounds contributed by rhs.  For a literal string, `absorbed` is the
  // number of leading runes folded into the repeat and `rhs_consumed` tells
  // whether anything of rhs is left over.
  int absorbed = 0;
  bool rhs_consumed = true;
  switch (rhs->op()) {
    case kRegexpStar:
      merged->max_ = -1;
      break;

    case kRegexpPlus:
      merged->min_++;
      merged->max_ = -1;
      break;

    case kRegexpQuest:
      if (merged->max() != -1)
        merged->max_++;
      break;

    case kRegexpRepeat:
      merged->min_ += rhs->min();
      if (rhs->max() == -1)
        merged->max_ = -1;
      else if (merged->max() != -1)
        merged->max_ += rhs->max();
      break;

    case kRegexpLiteral:
    case kRegexpCharClass:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
      merged->min_++;
      if (merged->max() != -1)
        merged->max_++;
      break;

    case kRegexpLiteralString: {
      const Rune r = lhs->sub()[0]->rune();
      // At least the first rune is absorbed; keep going while runes match.
      absorbed = 1;
      while (absorbed < rhs->nrunes() && rhs->runes()[absorbed] == r)
        absorbed++;
      merged->min_ += absorbed;
      if (merged->max() != -1)
        merged->max_ += absorbed;
      rhs_consumed = (absorbed == rhs->nrunes());
      break;
    }

    default:
      LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << rhs->op();
      merged->Decref();
      return;
  }

  if (rhs_consumed) {
    // Nothing of rhs remains: leave an empty match in the first slot.
    *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
    *r2ptr = merged;
  } else {
    // Keep the unabsorbed tail of the literal string after the repeat.
    *r1ptr = merged;
    *r2ptr = Regexp::LiteralString(
        &rhs->runes()[absorbed], rhs->nrunes() - absorbed,
        rhs->parse_flags());
  }

  lhs->Decref();
  rhs->Decref();
}
|
||||
|
||||
// Copy hook: no new node is built; the result is whatever re->Incref()
// returns.
Regexp* SimplifyWalker::Copy(Regexp* re) {
  Regexp* same = re->Incref();
  return same;
}
|
||||
|
||||
// Not expected to run: this walker is driven through Walk rather than
// WalkExponential.  Logs at DFATAL and falls back to re->Incref().
Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
  LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
  Regexp* fallback = re->Incref();
  return fallback;
}
|
||||
|
||||
// Pre-order hook.  When re->simple() is already true, sets *stop and
// returns re (Incref'ed).  Otherwise returns NULL and leaves *stop alone.
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
  if (!re->simple())
    return NULL;
  *stop = true;
  return re->Incref();
}
|
||||
|
||||
// Post-order hook: produces the simplified form of re given its already
// simplified children in child_args.  Nodes that come back unchanged are
// marked simple_ and returned Incref'ed; otherwise a new node is built
// around the new children.  Repeats go through SimplifyRepeat and
// character classes through SimplifyCharClass.
Regexp* SimplifyWalker::PostVisit(Regexp* re,
                                  Regexp* parent_arg,
                                  Regexp* pre_arg,
                                  Regexp** child_args,
                                  int nchild_args) {
  switch (re->op()) {
    // Ops that are always simple.
    case kRegexpNoMatch:
    case kRegexpEmptyMatch:
    case kRegexpLiteral:
    case kRegexpLiteralString:
    case kRegexpBeginLine:
    case kRegexpEndLine:
    case kRegexpBeginText:
    case kRegexpWordBoundary:
    case kRegexpNoWordBoundary:
    case kRegexpEndText:
    case kRegexpAnyChar:
    case kRegexpAnyByte:
    case kRegexpHaveMatch:
      re->simple_ = true;
      return re->Incref();

    // Simple as long as the subpieces are simple.
    case kRegexpConcat:
    case kRegexpAlternate: {
      if (ChildArgsChanged(re, child_args)) {
        Regexp* rebuilt = new Regexp(re->op(), re->parse_flags());
        rebuilt->AllocSub(re->nsub());
        Regexp** rebuilt_subs = rebuilt->sub();
        for (int k = 0; k < re->nsub(); k++)
          rebuilt_subs[k] = child_args[k];
        rebuilt->simple_ = true;
        return rebuilt;
      }
      re->simple_ = true;
      return re->Incref();
    }

    case kRegexpCapture: {
      Regexp* body = child_args[0];
      if (body != re->sub()[0]) {
        // The body changed: wrap it in a new capture with the same index.
        Regexp* wrapped = new Regexp(kRegexpCapture, re->parse_flags());
        wrapped->AllocSub(1);
        wrapped->sub()[0] = body;
        wrapped->cap_ = re->cap();
        wrapped->simple_ = true;
        return wrapped;
      }
      body->Decref();
      re->simple_ = true;
      return re->Incref();
    }

    case kRegexpStar:
    case kRegexpPlus:
    case kRegexpQuest: {
      Regexp* body = child_args[0];
      // Repeating the empty string any number of times is still the
      // empty string.
      if (body->op() == kRegexpEmptyMatch)
        return body;

      // Unchanged operand: the node itself is already simple.
      if (body == re->sub()[0]) {
        body->Decref();
        re->simple_ = true;
        return re->Incref();
      }

      // The same operator applied twice with identical flags is
      // idempotent, so the operand can stand in for the whole node.
      if (re->op() == body->op() &&
          re->parse_flags() == body->parse_flags())
        return body;

      Regexp* wrapped = new Regexp(re->op(), re->parse_flags());
      wrapped->AllocSub(1);
      wrapped->sub()[0] = body;
      wrapped->simple_ = true;
      return wrapped;
    }

    case kRegexpRepeat: {
      Regexp* body = child_args[0];
      // Repeating the empty string any number of times is still the
      // empty string.
      if (body->op() == kRegexpEmptyMatch)
        return body;

      // SimplifyRepeat does not consume body, so release it afterwards.
      Regexp* expanded = SimplifyRepeat(body, re->min_, re->max_,
                                        re->parse_flags());
      body->Decref();
      expanded->simple_ = true;
      return expanded;
    }

    case kRegexpCharClass: {
      Regexp* reduced = SimplifyCharClass(re);
      reduced->simple_ = true;
      return reduced;
    }
  }

  LOG(ERROR) << "Simplify case not handled: " << re->op();
  return re->Incref();
}
|
||||
|
||||
// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
|
||||
// Returns a new Regexp, handing the ref to the caller.
|
||||
Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
|
||||
Regexp::ParseFlags parse_flags) {
|
||||
Regexp* re = new Regexp(kRegexpConcat, parse_flags);
|
||||
re->AllocSub(2);
|
||||
Regexp** subs = re->sub();
|
||||
subs[0] = re1;
|
||||
subs[1] = re2;
|
||||
return re;
|
||||
}
|
||||
|
||||
// Simplifies the expression re{min,max} in terms of *, +, and ?.
|
||||
// Returns a new regexp. Does not edit re. Does not consume reference to re.
|
||||
// Caller must Decref return value when done with it.
|
||||
// The result will *not* necessarily have the right capturing parens
|
||||
// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
|
||||
// but in the Regexp* representation, both (x) are marked as $1.
|
||||
Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
|
||||
Regexp::ParseFlags f) {
|
||||
// x{n,} means at least n matches of x.
|
||||
if (max == -1) {
|
||||
// Special case: x{0,} is x*
|
||||
if (min == 0)
|
||||
return Regexp::Star(re->Incref(), f);
|
||||
|
||||
// Special case: x{1,} is x+
|
||||
if (min == 1)
|
||||
return Regexp::Plus(re->Incref(), f);
|
||||
|
||||
// General case: x{4,} is xxxx+
|
||||
Regexp* nre = new Regexp(kRegexpConcat, f);
|
||||
nre->AllocSub(min);
|
||||
Regexp** nre_subs = nre->sub();
|
||||
for (int i = 0; i < min-1; i++)
|
||||
nre_subs[i] = re->Incref();
|
||||
nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
|
||||
return nre;
|
||||
}
|
||||
|
||||
// Special case: (x){0} matches only empty string.
|
||||
if (min == 0 && max == 0)
|
||||
return new Regexp(kRegexpEmptyMatch, f);
|
||||
|
||||
// Special case: x{1} is just x.
|
||||
if (min == 1 && max == 1)
|
||||
return re->Incref();
|
||||
|
||||
// General case: x{n,m} means n copies of x and m copies of x?.
|
||||
// The machine will do less work if we nest the final m copies,
|
||||
// so that x{2,5} = xx(x(x(x)?)?)?
|
||||
|
||||
// Build leading prefix: xx. Capturing only on the last one.
|
||||
Regexp* nre = NULL;
|
||||
if (min > 0) {
|
||||
nre = new Regexp(kRegexpConcat, f);
|
||||
nre->AllocSub(min);
|
||||
Regexp** nre_subs = nre->sub();
|
||||
for (int i = 0; i < min; i++)
|
||||
nre_subs[i] = re->Incref();
|
||||
}
|
||||
|
||||
// Build and attach suffix: (x(x(x)?)?)?
|
||||
if (max > min) {
|
||||
Regexp* suf = Regexp::Quest(re->Incref(), f);
|
||||
for (int i = min+1; i < max; i++)
|
||||
suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
|
||||
if (nre == NULL)
|
||||
nre = suf;
|
||||
else
|
||||
nre = Concat2(nre, suf, f);
|
||||
}
|
||||
|
||||
if (nre == NULL) {
|
||||
// Some degenerate case, like min > max, or min < max < 0.
|
||||
// This shouldn't happen, because the parser rejects such regexps.
|
||||
LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
|
||||
return new Regexp(kRegexpNoMatch, f);
|
||||
}
|
||||
|
||||
return nre;
|
||||
}
|
||||
|
||||
// Simplifies a character class.
|
||||
// Caller must Decref return value when done with it.
|
||||
Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
|
||||
CharClass* cc = re->cc();
|
||||
|
||||
// Special cases
|
||||
if (cc->empty())
|
||||
return new Regexp(kRegexpNoMatch, re->parse_flags());
|
||||
if (cc->full())
|
||||
return new Regexp(kRegexpAnyChar, re->parse_flags());
|
||||
|
||||
return re->Incref();
|
||||
}
|
||||
|
||||
} // namespace re2
|
98
third_party/re2/re2/stringpiece.cc
vendored
98
third_party/re2/re2/stringpiece.cc
vendored
@ -1,98 +0,0 @@
|
||||
// Copyright 2004 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "re2/stringpiece.h"
|
||||
#include "util/util.h"
|
||||
|
||||
using re2::StringPiece;
|
||||
|
||||
// Streams the bytes of piece (data(), size()) to o without requiring a
// terminating NUL, and returns o.
std::ostream& operator<<(std::ostream& o, const StringPiece& piece) {
  return o.write(piece.data(), piece.size());
}
|
||||
|
||||
bool StringPiece::_equal(const StringPiece& x, const StringPiece& y) {
|
||||
int len = x.size();
|
||||
if (len != y.size()) {
|
||||
return false;
|
||||
}
|
||||
const char* p = x.data();
|
||||
const char* p2 = y.data();
|
||||
// Test last byte in case strings share large common prefix
|
||||
if ((len > 0) && (p[len-1] != p2[len-1])) return false;
|
||||
const char* p_limit = p + len;
|
||||
for (; p < p_limit; p++, p2++) {
|
||||
if (*p != *p2)
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Replaces the contents of *target with the bytes viewed by this piece.
void StringPiece::CopyToString(string* target) const {
  target->assign(data(), size());
}
|
||||
|
||||
// Appends the bytes viewed by this piece to *target.
void StringPiece::AppendToString(string* target) const {
  target->append(data(), size());
}
|
||||
|
||||
// Copies up to n bytes, starting at offset pos, into buf and returns the
// number of bytes copied.  buf must have room for at least n bytes.
//
// A pos at or past the end of the piece copies nothing and returns 0.
// Previously `length_ - pos` was evaluated in unsigned arithmetic, so a pos
// greater than length_ wrapped to a huge value and memcpy read past the end
// of the viewed buffer.
StringPiece::size_type StringPiece::copy(char* buf, size_type n,
                                         size_type pos) const {
  if (length_ <= 0 || pos >= static_cast<size_type>(length_))
    return 0;
  const size_type avail = static_cast<size_type>(length_) - pos;
  const size_type ret = std::min(avail, n);
  memcpy(buf, ptr_ + pos, ret);
  return ret;
}
|
||||
|
||||
// True when s occurs somewhere in this piece, as reported by find(s, 0).
bool StringPiece::contains(StringPiece s) const {
  const size_type where = find(s, 0);
  return where != npos;
}
|
||||
|
||||
// Returns the offset of the first occurrence of s at or after pos, or npos
// when there is none (or when pos lies beyond the end of this piece).
StringPiece::size_type StringPiece::find(const StringPiece& s,
                                         size_type pos) const {
  const bool in_range =
      length_ >= 0 && pos <= static_cast<size_type>(length_);
  if (!in_range)
    return npos;

  const char* hit = std::search(ptr_ + pos, ptr_ + length_,
                                s.ptr_, s.ptr_ + s.length_);
  const size_type offset = hit - ptr_;
  // std::search returns the end iterator on failure; in that case the
  // match would not fit inside the piece.
  if (offset + s.length_ > static_cast<size_type>(length_))
    return npos;
  return offset;
}
|
||||
|
||||
// Returns the offset of the first byte equal to c at or after pos, or npos
// when there is none (including when the piece is empty or pos is past the
// last byte).
StringPiece::size_type StringPiece::find(char c, size_type pos) const {
  if (length_ <= 0 || pos >= static_cast<size_type>(length_))
    return npos;

  const char* const limit = ptr_ + length_;
  const char* hit = std::find(ptr_ + pos, limit, c);
  if (hit == limit)
    return npos;
  return hit - ptr_;
}
|
||||
|
||||
// Returns the offset of the last occurrence of s that starts at or before
// pos, or npos when there is none.  An empty s matches at min(size, pos).
StringPiece::size_type StringPiece::rfind(const StringPiece& s,
                                          size_type pos) const {
  if (length_ < s.length_)
    return npos;

  const size_type ulen = length_;
  if (s.length_ == 0)
    return min(ulen, pos);

  // One past the last byte a match starting at or before pos may cover.
  const size_type latest_start = min(ulen - s.length_, pos);
  const char* limit = ptr_ + latest_start + s.length_;
  const char* hit = std::find_end(ptr_, limit, s.ptr_, s.ptr_ + s.length_);
  if (hit == limit)
    return npos;
  return hit - ptr_;
}
|
||||
|
||||
// Returns the offset of the last byte equal to c at or before pos, or npos
// when there is none (including when the piece is empty).
StringPiece::size_type StringPiece::rfind(char c, size_type pos) const {
  if (length_ <= 0)
    return npos;

  // Start at pos, clamped to the last valid index, and scan backwards.
  const size_type last_index = static_cast<size_type>(length_ - 1);
  int i = static_cast<int>(min(pos, last_index));
  while (i >= 0) {
    if (ptr_[i] == c)
      return i;
    --i;
  }
  return npos;
}
|
||||
|
||||
// Returns the sub-piece starting at pos and spanning at most n bytes.
// pos is clamped to the size of the piece and n to the bytes that remain.
StringPiece StringPiece::substr(size_type pos, size_type n) const {
  const size_type len = static_cast<size_type>(length_);
  if (pos > len)
    pos = len;
  const size_type remaining = len - pos;
  if (n > remaining)
    n = remaining;
  return StringPiece(ptr_ + pos, static_cast<int>(n));
}
|
||||
|
||||
const StringPiece::size_type StringPiece::npos = size_type(-1);
|
185
third_party/re2/re2/stringpiece.h
vendored
185
third_party/re2/re2/stringpiece.h
vendored
@ -1,185 +0,0 @@
|
||||
// Copyright 2001-2010 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// A string-like object that points to a sized piece of memory.
|
||||
//
|
||||
// Functions or methods may use const StringPiece& parameters to accept either
|
||||
// a "const char*" or a "string" value that will be implicitly converted to
|
||||
// a StringPiece. The implicit conversion means that it is often appropriate
|
||||
// to include this .h file in other files rather than forward-declaring
|
||||
// StringPiece as would be appropriate for most other Google classes.
|
||||
//
|
||||
// Systematic usage of StringPiece is encouraged as it will reduce unnecessary
|
||||
// conversions from "const char*" to "string" and back again.
|
||||
//
|
||||
//
|
||||
// Arghh! I wish C++ literals were "string".
|
||||
|
||||
#ifndef STRINGS_STRINGPIECE_H__
|
||||
#define STRINGS_STRINGPIECE_H__
|
||||
|
||||
#include <string.h>
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <iosfwd>
|
||||
#include <string>
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// A non-owning view of a sized run of bytes: a pointer plus an int length.
// The bytes are not copied and need not be NUL terminated.
//
// compare(), starts_with() and ends_with() skip memcmp when there is
// nothing to compare, because calling memcmp with a null pointer (as held
// by a default-constructed piece) is undefined even for a zero length.
class StringPiece {
 private:
  const char* ptr_;
  int length_;

 public:
  // We provide non-explicit singleton constructors so users can pass
  // in a "const char*" or a "string" wherever a "StringPiece" is
  // expected.
  StringPiece() : ptr_(NULL), length_(0) { }
  StringPiece(const char* str)
    : ptr_(str), length_((str == NULL) ? 0 : static_cast<int>(strlen(str))) { }
  StringPiece(const std::string& str)
    : ptr_(str.data()), length_(static_cast<int>(str.size())) { }
  StringPiece(const char* offset, int len) : ptr_(offset), length_(len) { }

  // data() may return a pointer to a buffer with embedded NULs, and the
  // returned buffer may or may not be null terminated.  Therefore it is
  // typically a mistake to pass data() to a routine that expects a NUL
  // terminated string.
  const char* data() const { return ptr_; }
  int size() const { return length_; }
  int length() const { return length_; }
  bool empty() const { return length_ == 0; }

  void clear() { ptr_ = NULL; length_ = 0; }
  void set(const char* data, int len) { ptr_ = data; length_ = len; }
  // Points at the NUL-terminated string str; a NULL str yields length 0.
  void set(const char* str) {
    ptr_ = str;
    if (str != NULL)
      length_ = static_cast<int>(strlen(str));
    else
      length_ = 0;
  }
  void set(const void* data, int len) {
    ptr_ = reinterpret_cast<const char*>(data);
    length_ = len;
  }

  // No bounds checking is performed.
  char operator[](int i) const { return ptr_[i]; }

  // Drops the first n bytes from the view.  No bounds checking.
  void remove_prefix(int n) {
    ptr_ += n;
    length_ -= n;
  }

  // Drops the last n bytes from the view.  No bounds checking.
  void remove_suffix(int n) {
    length_ -= n;
  }

  // Three-way byte-wise comparison: negative, zero or positive as this
  // piece sorts before, equal to or after x.  A shorter piece that is a
  // prefix of the other sorts first.
  int compare(const StringPiece& x) const {
    const int common = std::min(length_, x.length_);
    // Skip memcmp when there is no common prefix to compare.
    int r = (common > 0) ? memcmp(ptr_, x.ptr_, common) : 0;
    if (r == 0) {
      if (length_ < x.length_) r = -1;
      else if (length_ > x.length_) r = +1;
    }
    return r;
  }

  std::string as_string() const {
    return std::string(data(), size());
  }
  // We also define ToString() here, since many other string-like
  // interfaces name the routine that converts to a C++ string
  // "ToString", and it's confusing to have the method that does that
  // for a StringPiece be called "as_string()".  We also leave the
  // "as_string()" method defined here for existing code.
  std::string ToString() const {
    return std::string(data(), size());
  }

  void CopyToString(std::string* target) const;
  void AppendToString(std::string* target) const;

  // Does "this" start with "x"
  bool starts_with(const StringPiece& x) const {
    return ((length_ >= x.length_) &&
            (x.length_ <= 0 ||
             memcmp(ptr_, x.ptr_, x.length_) == 0));
  }

  // Does "this" end with "x"
  bool ends_with(const StringPiece& x) const {
    return ((length_ >= x.length_) &&
            (x.length_ <= 0 ||
             memcmp(ptr_ + (length_-x.length_), x.ptr_, x.length_) == 0));
  }

  // standard STL container boilerplate
  typedef char value_type;
  typedef const char* pointer;
  typedef const char& reference;
  typedef const char& const_reference;
  typedef size_t size_type;
  typedef ptrdiff_t difference_type;
  static const size_type npos;
  typedef const char* const_iterator;
  typedef const char* iterator;
  typedef std::reverse_iterator<const_iterator> const_reverse_iterator;
  typedef std::reverse_iterator<iterator> reverse_iterator;
  iterator begin() const { return ptr_; }
  iterator end() const { return ptr_ + length_; }
  const_reverse_iterator rbegin() const {
    return const_reverse_iterator(ptr_ + length_);
  }
  const_reverse_iterator rend() const {
    return const_reverse_iterator(ptr_);
  }
  // The STL says to return size_type, but Google says return int.
  int max_size() const { return length_; }
  int capacity() const { return length_; }

  size_type copy(char* buf, size_type n, size_type pos = 0) const;

  bool contains(StringPiece s) const;

  size_type find(const StringPiece& s, size_type pos = 0) const;
  size_type find(char c, size_type pos = 0) const;
  size_type rfind(const StringPiece& s, size_type pos = npos) const;
  size_type rfind(char c, size_type pos = npos) const;

  StringPiece substr(size_type pos, size_type n = npos) const;

  static bool _equal(const StringPiece&, const StringPiece&);
};
|
||||
|
||||
inline bool operator==(const StringPiece& x, const StringPiece& y) {
|
||||
return StringPiece::_equal(x, y);
|
||||
}
|
||||
|
||||
inline bool operator!=(const StringPiece& x, const StringPiece& y) {
|
||||
return !(x == y);
|
||||
}
|
||||
|
||||
inline bool operator<(const StringPiece& x, const StringPiece& y) {
|
||||
const int r = memcmp(x.data(), y.data(),
|
||||
std::min(x.size(), y.size()));
|
||||
return ((r < 0) || ((r == 0) && (x.size() < y.size())));
|
||||
}
|
||||
|
||||
inline bool operator>(const StringPiece& x, const StringPiece& y) {
|
||||
return y < x;
|
||||
}
|
||||
|
||||
inline bool operator<=(const StringPiece& x, const StringPiece& y) {
|
||||
return !(x > y);
|
||||
}
|
||||
|
||||
inline bool operator>=(const StringPiece& x, const StringPiece& y) {
|
||||
return !(x < y);
|
||||
}
|
||||
|
||||
} // namespace re2
|
||||
|
||||
// allow StringPiece to be logged
|
||||
extern std::ostream& operator<<(std::ostream& o, const re2::StringPiece& piece);
|
||||
|
||||
#endif // STRINGS_STRINGPIECE_H__
|
255
third_party/re2/re2/testing/backtrack.cc
vendored
255
third_party/re2/re2/testing/backtrack.cc
vendored
@ -1,255 +0,0 @@
|
||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Tested by search_test.cc, exhaustive_test.cc, tester.cc
|
||||
//
|
||||
// Prog::UnsafeSearchBacktrack is a backtracking regular expression search,
|
||||
// except that it remembers where it has been, trading a lot of
|
||||
// memory for a lot of time. It exists only for testing purposes.
|
||||
//
|
||||
// Let me repeat that.
|
||||
//
|
||||
// THIS CODE SHOULD NEVER BE USED IN PRODUCTION:
|
||||
// - It uses a ton of memory.
|
||||
// - It uses a ton of stack.
|
||||
// - It uses CHECK and LOG(FATAL).
|
||||
// - It implements unanchored search by repeated anchored search.
|
||||
//
|
||||
// On the other hand, it is very simple and a good reference
|
||||
// implementation for the more complicated regexp packages.
|
||||
//
|
||||
// In BUILD, this file is linked into the ":testing" library,
|
||||
// not the main library, in order to make it harder to pick up
|
||||
// accidentally.
|
||||
|
||||
#include "util/util.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Backtracker holds the state for a backtracking search.
|
||||
//
|
||||
// Excluding the search parameters, the main search state
|
||||
// is just the "capture registers", which record, for the
|
||||
// current execution, the string position at which each
|
||||
// parenthesis was passed. cap_[0] and cap_[1] are the
|
||||
// left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc.
|
||||
//
|
||||
// To avoid infinite loops during backtracking on expressions
|
||||
// like (a*)*, the visited_[] bitmap marks the (state, string-position)
|
||||
// pairs that have already been explored and are thus not worth
|
||||
// re-exploring if we get there via another path. Modern backtracking
|
||||
// libraries engineer their program representation differently, to make
|
||||
// such infinite loops possible to avoid without keeping a giant visited_
|
||||
// bitmap, but visited_ works fine for a reference implementation
|
||||
// and it has the nice benefit of making the search run in linear time.
|
||||
class Backtracker {
|
||||
public:
|
||||
explicit Backtracker(Prog* prog);
|
||||
~Backtracker();
|
||||
|
||||
bool Search(const StringPiece& text, const StringPiece& context,
|
||||
bool anchored, bool longest,
|
||||
StringPiece* submatch, int nsubmatch);
|
||||
|
||||
private:
|
||||
// Explores from instruction ip at string position p looking for a match.
|
||||
// Returns true if found (so that caller can stop trying other possibilities).
|
||||
bool Visit(int id, const char* p);
|
||||
|
||||
// Search parameters
|
||||
Prog* prog_; // program being run
|
||||
StringPiece text_; // text being searched
|
||||
StringPiece context_; // greater context of text being searched
|
||||
bool anchored_; // whether search is anchored at text.begin()
|
||||
bool longest_; // whether search wants leftmost-longest match
|
||||
bool endmatch_; // whether search must end at text.end()
|
||||
StringPiece *submatch_; // submatches to fill in
|
||||
int nsubmatch_; // # of submatches to fill in
|
||||
|
||||
// Search state
|
||||
const char* cap_[64]; // capture registers
|
||||
uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked
|
||||
size_t nvisited_; // # of words in bitmap
|
||||
};
|
||||
|
||||
// Binds the backtracker to prog with empty search parameters and no
// visited bitmap.  cap_ is not set here; Search() clears it on every call.
Backtracker::Backtracker(Prog* prog)
    : prog_(prog), anchored_(false), longest_(false), endmatch_(false),
      submatch_(NULL), nsubmatch_(0),
      visited_(NULL), nvisited_(0) {
}
|
||||
|
||||
// Releases the visited bitmap allocated by the most recent Search(), if any.
Backtracker::~Backtracker() {
  delete[] visited_;
}
|
||||
|
||||
// Runs a backtracking search.
|
||||
bool Backtracker::Search(const StringPiece& text, const StringPiece& context,
|
||||
bool anchored, bool longest,
|
||||
StringPiece* submatch, int nsubmatch) {
|
||||
text_ = text;
|
||||
context_ = context;
|
||||
if (context_.begin() == NULL)
|
||||
context_ = text;
|
||||
if (prog_->anchor_start() && text.begin() > context_.begin())
|
||||
return false;
|
||||
if (prog_->anchor_end() && text.end() < context_.end())
|
||||
return false;
|
||||
anchored_ = anchored | prog_->anchor_start();
|
||||
longest_ = longest | prog_->anchor_end();
|
||||
endmatch_ = prog_->anchor_end();
|
||||
submatch_ = submatch;
|
||||
nsubmatch_ = nsubmatch;
|
||||
CHECK(2*nsubmatch_ < arraysize(cap_));
|
||||
memset(cap_, 0, sizeof cap_);
|
||||
|
||||
// We use submatch_[0] for our own bookkeeping,
|
||||
// so it had better exist.
|
||||
StringPiece sp0;
|
||||
if (nsubmatch < 1) {
|
||||
submatch_ = &sp0;
|
||||
nsubmatch_ = 1;
|
||||
}
|
||||
submatch_[0] = NULL;
|
||||
|
||||
// Allocate new visited_ bitmap -- size is proportional
|
||||
// to text, so have to reallocate on each call to Search.
|
||||
delete[] visited_;
|
||||
nvisited_ = (prog_->size()*(text.size()+1) + 31)/32;
|
||||
visited_ = new uint32[nvisited_];
|
||||
memset(visited_, 0, nvisited_*sizeof visited_[0]);
|
||||
|
||||
// Anchored search must start at text.begin().
|
||||
if (anchored_) {
|
||||
cap_[0] = text.begin();
|
||||
return Visit(prog_->start(), text.begin());
|
||||
}
|
||||
|
||||
// Unanchored search, starting from each possible text position.
|
||||
// Notice that we have to try the empty string at the end of
|
||||
// the text, so the loop condition is p <= text.end(), not p < text.end().
|
||||
for (const char* p = text.begin(); p <= text.end(); p++) {
|
||||
cap_[0] = p;
|
||||
if (Visit(prog_->start(), p)) // Match must be leftmost; done.
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Explores from instruction ip at string position p looking for a match.
|
||||
// Return true if found (so that caller can stop trying other possibilities).
|
||||
bool Backtracker::Visit(int id, const char* p) {
|
||||
// Check bitmap. If we've already explored from here,
|
||||
// either it didn't match or it did but we're hoping for a better match.
|
||||
// Either way, don't go down that road again.
|
||||
CHECK(p <= text_.end());
|
||||
size_t n = id*(text_.size()+1) + (p - text_.begin());
|
||||
CHECK_LT(n/32, nvisited_);
|
||||
if (visited_[n/32] & (1 << (n&31)))
|
||||
return false;
|
||||
visited_[n/32] |= 1 << (n&31);
|
||||
|
||||
// Pick out byte at current position. If at end of string,
|
||||
// have to explore in hope of finishing a match. Use impossible byte -1.
|
||||
int c = -1;
|
||||
if (p < text_.end())
|
||||
c = *p & 0xFF;
|
||||
|
||||
Prog::Inst* ip = prog_->inst(id);
|
||||
switch (ip->opcode()) {
|
||||
default:
|
||||
LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode();
|
||||
return false; // not reached
|
||||
|
||||
case kInstAlt:
|
||||
case kInstAltMatch:
|
||||
// Try both possible next states: out is preferred to out1.
|
||||
if (Visit(ip->out(), p)) {
|
||||
if (longest_)
|
||||
Visit(ip->out1(), p);
|
||||
return true;
|
||||
}
|
||||
return Visit(ip->out1(), p);
|
||||
|
||||
case kInstByteRange:
|
||||
if (ip->Matches(c))
|
||||
return Visit(ip->out(), p+1);
|
||||
return false;
|
||||
|
||||
case kInstCapture:
|
||||
if (0 <= ip->cap() && ip->cap() < arraysize(cap_)) {
|
||||
// Capture p to register, but save old value.
|
||||
const char* q = cap_[ip->cap()];
|
||||
cap_[ip->cap()] = p;
|
||||
bool ret = Visit(ip->out(), p);
|
||||
// Restore old value as we backtrack.
|
||||
cap_[ip->cap()] = q;
|
||||
return ret;
|
||||
}
|
||||
return Visit(ip->out(), p);
|
||||
|
||||
case kInstEmptyWidth:
|
||||
if (ip->empty() & ~Prog::EmptyFlags(context_, p))
|
||||
return false;
|
||||
return Visit(ip->out(), p);
|
||||
|
||||
case kInstNop:
|
||||
return Visit(ip->out(), p);
|
||||
|
||||
case kInstMatch:
|
||||
// We found a match. If it's the best so far, record the
|
||||
// parameters in the caller's submatch_ array.
|
||||
if (endmatch_ && p != context_.end())
|
||||
return false;
|
||||
cap_[1] = p;
|
||||
if (submatch_[0].data() == NULL || // First match so far ...
|
||||
(longest_ && p > submatch_[0].end())) { // ... or better match
|
||||
for (int i = 0; i < nsubmatch_; i++)
|
||||
submatch_[i].set(cap_[2*i],
|
||||
static_cast<int>(cap_[2*i+1] - cap_[2*i]));
|
||||
}
|
||||
return true;
|
||||
|
||||
case kInstFail:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Runs a backtracking search.
|
||||
bool Prog::UnsafeSearchBacktrack(const StringPiece& text,
|
||||
const StringPiece& context,
|
||||
Anchor anchor,
|
||||
MatchKind kind,
|
||||
StringPiece* match,
|
||||
int nmatch) {
|
||||
// If full match, we ask for an anchored longest match
|
||||
// and then check that match[0] == text.
|
||||
// So make sure match[0] exists.
|
||||
StringPiece sp0;
|
||||
if (kind == kFullMatch) {
|
||||
anchor = kAnchored;
|
||||
if (nmatch < 1) {
|
||||
match = &sp0;
|
||||
nmatch = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Run the search.
|
||||
Backtracker b(this);
|
||||
bool anchored = anchor == kAnchored;
|
||||
bool longest = kind != kFirstMatch;
|
||||
if (!b.Search(text, context, anchored, longest, match, nmatch))
|
||||
return false;
|
||||
if (kind == kFullMatch && match[0].end() != text.end())
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace re2
|
223
third_party/re2/re2/testing/charclass_test.cc
vendored
223
third_party/re2/re2/testing/charclass_test.cc
vendored
@ -1,223 +0,0 @@
|
||||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Test character class manipulations.
|
||||
|
||||
#include "util/test.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
struct CCTest {
|
||||
struct {
|
||||
Rune lo;
|
||||
Rune hi;
|
||||
} add[10];
|
||||
int remove;
|
||||
struct {
|
||||
Rune lo;
|
||||
Rune hi;
|
||||
} final[10];
|
||||
};
|
||||
|
||||
// Table of character-class cases: { ranges to add, remove-above value
// (-1 for none), expected final ranges }.
static CCTest tests[] = {
  // A single range.
  { { { 10, 20 }, {-1} },
    -1,
    { { 10, 20 }, {-1} } },

  // Ranges that touch or overlap are expected to be merged.
  { { { 10, 20 }, { 20, 30 }, {-1} },
    -1,
    { { 10, 30 }, {-1} } },

  { { { 10, 20 }, { 30, 40 }, { 20, 30 }, {-1} },
    -1,
    { { 10, 40 }, {-1} } },

  { { { 0, 50 }, { 20, 30 }, {-1} },
    -1,
    { { 0, 50 }, {-1} } },

  // Disjoint ranges, added in order and out of order.
  { { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} },
    -1,
    { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },

  { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} },
    -1,
    { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },

  // (Same case as the previous entry; kept as in the original table.)
  { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, {-1} },
    -1,
    { { 10, 11 }, { 13, 14 }, { 16, 17 }, { 19, 20 }, { 22, 23 }, {-1} } },

  // A late range that swallows or bridges the earlier ones.
  { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 5, 25 }, {-1} },
    -1,
    { { 5, 25 }, {-1} } },

  { { { 13, 14 }, { 10, 11 }, { 22, 23 }, { 19, 20 }, { 16, 17 }, { 12, 21 }, {-1} },
    -1,
    { { 10, 23 }, {-1} } },

  // These check boundary cases during negation.
  { { { 0, Runemax }, {-1} },
    -1,
    { { 0, Runemax }, {-1} } },

  { { { 0, 50 }, {-1} },
    -1,
    { { 0, 50 }, {-1} } },

  { { { 50, Runemax }, {-1} },
    -1,
    { { 50, Runemax }, {-1} } },

  // Check RemoveAbove.
  { { { 50, Runemax }, {-1} },
    255,
    { { 50, 255 }, {-1} } },

  { { { 50, Runemax }, {-1} },
    65535,
    { { 50, 65535 }, {-1} } },

  { { { 50, Runemax }, {-1} },
    Runemax,
    { { 50, Runemax }, {-1} } },

  { { { 50, 60 }, { 250, 260 }, { 350, 360 }, {-1} },
    255,
    { { 50, 60 }, { 250, 255 }, {-1} } },

  { { { 50, 60 }, {-1} },
    255,
    { { 50, 60 }, {-1} } },

  { { { 350, 360 }, {-1} },
    255,
    { {-1} } },

  { { {-1} },
    255,
    { {-1} } },
};
|
||||
|
||||
template<class CharClass>
|
||||
static void Broke(const char *desc, const CCTest* t, CharClass* cc) {
|
||||
if (t == NULL) {
|
||||
printf("\t%s:", desc);
|
||||
} else {
|
||||
printf("\n");
|
||||
printf("CharClass added: [%s]", desc);
|
||||
for (int k = 0; t->add[k].lo >= 0; k++)
|
||||
printf(" %d-%d", t->add[k].lo, t->add[k].hi);
|
||||
printf("\n");
|
||||
if (t->remove >= 0)
|
||||
printf("Removed > %d\n", t->remove);
|
||||
printf("\twant:");
|
||||
for (int k = 0; t->final[k].lo >= 0; k++)
|
||||
printf(" %d-%d", t->final[k].lo, t->final[k].hi);
|
||||
printf("\n");
|
||||
printf("\thave:");
|
||||
}
|
||||
|
||||
for (typename CharClass::iterator it = cc->begin(); it != cc->end(); ++it)
|
||||
printf(" %d-%d", it->lo, it->hi);
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
// Reports whether x lies inside any of the expected final ranges of t.
bool ShouldContain(CCTest *t, int x) {
  int j = 0;
  while (t->final[j].lo >= 0) {
    const bool below = x < t->final[j].lo;
    const bool above = x > t->final[j].hi;
    if (!below && !above)
      return true;
    j++;
  }
  return false;
}
|
||||
|
||||
// Helpers to make templated CorrectCC work with both CharClass and CharClassBuilder.
|
||||
|
||||
CharClass* Negate(CharClass *cc) {
|
||||
return cc->Negate();
|
||||
}
|
||||
|
||||
void Delete(CharClass* cc) {
|
||||
cc->Delete();
|
||||
}
|
||||
|
||||
CharClassBuilder* Negate(CharClassBuilder* cc) {
|
||||
CharClassBuilder* ncc = cc->Copy();
|
||||
ncc->Negate();
|
||||
return ncc;
|
||||
}
|
||||
|
||||
void Delete(CharClassBuilder* cc) {
|
||||
delete cc;
|
||||
}
|
||||
|
||||
template<class CharClass>
|
||||
bool CorrectCC(CharClass *cc, CCTest *t, const char *desc) {
|
||||
typename CharClass::iterator it = cc->begin();
|
||||
int size = 0;
|
||||
for (int j = 0; t->final[j].lo >= 0; j++, ++it) {
|
||||
if (it == cc->end() ||
|
||||
it->lo != t->final[j].lo ||
|
||||
it->hi != t->final[j].hi) {
|
||||
Broke(desc, t, cc);
|
||||
return false;
|
||||
}
|
||||
size += it->hi - it->lo + 1;
|
||||
}
|
||||
if (it != cc->end()) {
|
||||
Broke(desc, t, cc);
|
||||
return false;
|
||||
}
|
||||
if (cc->size() != size) {
|
||||
Broke(desc, t, cc);
|
||||
printf("wrong size: want %d have %d\n", size, cc->size());
|
||||
return false;
|
||||
}
|
||||
|
||||
for (int j = 0; j < 101; j++) {
|
||||
if (j == 100)
|
||||
j = Runemax;
|
||||
if (ShouldContain(t, j) != cc->Contains(j)) {
|
||||
Broke(desc, t, cc);
|
||||
printf("want contains(%d)=%d, got %d\n",
|
||||
j, ShouldContain(t, j), cc->Contains(j));
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
CharClass* ncc = Negate(cc);
|
||||
for (int j = 0; j < 101; j++) {
|
||||
if (j == 100)
|
||||
j = Runemax;
|
||||
if (ShouldContain(t, j) == ncc->Contains(j)) {
|
||||
Broke(desc, t, cc);
|
||||
Broke("ncc", NULL, ncc);
|
||||
printf("want ncc contains(%d)!=%d, got %d\n",
|
||||
j, ShouldContain(t, j), ncc->Contains(j));
|
||||
Delete(ncc);
|
||||
return false;
|
||||
}
|
||||
if (ncc->size() != Runemax+1 - cc->size()) {
|
||||
Broke(desc, t, cc);
|
||||
Broke("ncc", NULL, ncc);
|
||||
printf("ncc size should be %d is %d\n",
|
||||
Runemax+1 - cc->size(), ncc->size());
|
||||
Delete(ncc);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
Delete(ncc);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Builds every case in tests[] with a CharClassBuilder and checks the
// builder, the CharClass obtained from it, and a copy of the builder.
TEST(TestCharClassBuilder, Adds) {
  int nfail = 0;
  for (int i = 0; i < arraysize(tests); i++) {
    CCTest* t = &tests[i];

    // Apply the additions, then the optional RemoveAbove.
    CharClassBuilder ccb;
    for (int j = 0; t->add[j].lo >= 0; j++)
      ccb.AddRange(t->add[j].lo, t->add[j].hi);
    if (t->remove >= 0)
      ccb.RemoveAbove(t->remove);

    nfail += CorrectCC(&ccb, t, "before copy (CharClassBuilder)") ? 0 : 1;

    CharClass* snapshot = ccb.GetCharClass();
    nfail += CorrectCC(snapshot, t, "before copy (CharClass)") ? 0 : 1;
    snapshot->Delete();

    CharClassBuilder* copy = ccb.Copy();
    nfail += CorrectCC(copy, t, "after copy (CharClassBuilder)") ? 0 : 1;

    // NOTE(review): this takes the CharClass from the original builder
    // again, not from the copy -- confirm whether the copy was intended.
    snapshot = ccb.GetCharClass();
    nfail += CorrectCC(snapshot, t, "after copy (CharClass)") ? 0 : 1;
    snapshot->Delete();

    delete copy;
  }
  EXPECT_EQ(nfail, 0);
}
|
||||
|
||||
} // namespace re2
|
175
third_party/re2/re2/testing/compile_test.cc
vendored
175
third_party/re2/re2/testing/compile_test.cc
vendored
@ -1,175 +0,0 @@
|
||||
// Copyright 2007 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Test prog.cc, compile.cc
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "util/test.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/prog.h"
|
||||
|
||||
DEFINE_string(show, "", "regular expression to compile and dump");
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Simple input/output tests checking that
|
||||
// the regexp compiles to the expected code.
|
||||
// These are just to sanity check the basic implementation.
|
||||
// The real confidence tests happen by testing the NFA/DFA
|
||||
// that run the compiled code.
|
||||
|
||||
// One compile test: a regexp and the program dump it is expected to
// compile to.
struct Test {
  const char* regexp;  // pattern to parse and compile
  const char* code;    // expected dump of the compiled program
};

static Test tests[] = {
  // Literals and alternation.
  { "a",
    "1. byte [61-61] -> 2\n"
    "2. match! 0\n" },
  { "ab",
    "1. byte [61-61] -> 2\n"
    "2. byte [62-62] -> 3\n"
    "3. match! 0\n" },
  { "a|c",
    "3. alt -> 1 | 2\n"
    "1. byte [61-61] -> 4\n"
    "2. byte [63-63] -> 4\n"
    "4. match! 0\n" },
  { "a|b",
    "1. byte [61-62] -> 2\n"
    "2. match! 0\n" },
  { "[ab]",
    "1. byte [61-62] -> 2\n"
    "2. match! 0\n" },

  // Repetition operators, greedy and non-greedy.
  { "a+",
    "1. byte [61-61] -> 2\n"
    "2. alt -> 1 | 3\n"
    "3. match! 0\n" },
  { "a+?",
    "1. byte [61-61] -> 2\n"
    "2. alt -> 3 | 1\n"
    "3. match! 0\n" },
  { "a*",
    "2. alt -> 1 | 3\n"
    "1. byte [61-61] -> 2\n"
    "3. match! 0\n" },
  { "a*?",
    "2. alt -> 3 | 1\n"
    "3. match! 0\n"
    "1. byte [61-61] -> 2\n" },
  { "a?",
    "2. alt -> 1 | 3\n"
    "1. byte [61-61] -> 3\n"
    "3. match! 0\n" },
  { "a??",
    "2. alt -> 3 | 1\n"
    "3. match! 0\n"
    "1. byte [61-61] -> 3\n" },
  { "a{4}",
    "1. byte [61-61] -> 2\n"
    "2. byte [61-61] -> 3\n"
    "3. byte [61-61] -> 4\n"
    "4. byte [61-61] -> 5\n"
    "5. match! 0\n" },

  // Capturing and non-capturing groups.
  { "(a)",
    "2. capture 2 -> 1\n"
    "1. byte [61-61] -> 3\n"
    "3. capture 3 -> 4\n"
    "4. match! 0\n" },
  { "(?:a)",
    "1. byte [61-61] -> 2\n"
    "2. match! 0\n" },

  // Empty pattern, dot and character classes.
  { "",
    "2. match! 0\n" },
  { ".",
    "3. alt -> 1 | 2\n"
    "1. byte [00-09] -> 4\n"
    "2. byte [0b-ff] -> 4\n"
    "4. match! 0\n" },
  { "[^ab]",
    "5. alt -> 3 | 4\n"
    "3. alt -> 1 | 2\n"
    "4. byte [63-ff] -> 6\n"
    "1. byte [00-09] -> 6\n"
    "2. byte [0b-60] -> 6\n"
    "6. match! 0\n" },
  { "[Aa]",
    "1. byte/i [61-61] -> 2\n"
    "2. match! 0\n" },

  // Issue 20992936
  { "[[-`]",
    "1. byte [5b-60] -> 2\n"
    "2. match! 0\n" },
};
|
||||
|
||||
// Parses and compiles every regexp in tests[] and compares the dump of
// the resulting program with the expected text. Mismatches are logged
// and counted; the test passes only if the count stays at zero.
TEST(TestRegexpCompileToProg, Simple) {
  int failed = 0;
  for (int i = 0; i < arraysize(tests); i++) {
    const re2::Test& want = tests[i];

    Regexp* re = Regexp::Parse(want.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
    if (re == NULL) {
      LOG(ERROR) << "Cannot parse: " << want.regexp;
      failed++;
      continue;
    }

    Prog* prog = re->CompileToProg(0);
    if (prog == NULL) {
      LOG(ERROR) << "Cannot compile: " << want.regexp;
      re->Decref();
      failed++;
      continue;
    }

    // Compiling with an argument of 1 is required to fail.
    CHECK(re->CompileToProg(1) == NULL);

    const string got = prog->Dump();
    if (got != want.code) {
      LOG(ERROR) << "Incorrect compiled code for: " << want.regexp;
      LOG(ERROR) << "Want:\n" << want.code;
      LOG(ERROR) << "Got:\n" << got;
      failed++;
    }

    delete prog;
    re->Decref();
  }
  EXPECT_EQ(failed, 0);
}
|
||||
|
||||
// The distinct byte ranges involved in the UTF-8 dot ([^\n]).
|
||||
// Once, erroneously split between 0x3f and 0x40 because it is
|
||||
// a 6-bit boundary.
|
||||
// Inclusive byte ranges; the position of a row in the table is the
// bytemap value expected for every byte in that row.
// NOTE(review): bytes 0x0B-0x0F fall in no row (the third row starts at
// 0x10), so they are not checked -- confirm whether 0x0B was intended.
static struct UTF8ByteRange {
  int lo;  // first byte of the range
  int hi;  // last byte of the range
} utf8ranges[] = {
  { 0x00, 0x09 }, { 0x0A, 0x0A }, { 0x10, 0x7F },
  { 0x80, 0x8F }, { 0x90, 0x9F }, { 0xA0, 0xBF },
  { 0xC0, 0xC1 }, { 0xC2, 0xDF },
  { 0xE0, 0xE0 }, { 0xE1, 0xEF },
  { 0xF0, 0xF0 }, { 0xF1, 0xF3 }, { 0xF4, 0xF4 }, { 0xF5, 0xFF },
};
|
||||
|
||||
// Compiles "." and checks that the program's bytemap has one class per
// row of utf8ranges and maps every listed byte to its row index.
TEST(TestCompile, ByteRanges) {
  Regexp* re = Regexp::Parse(".", Regexp::PerlX, NULL);
  EXPECT_TRUE(re != NULL);
  Prog* prog = re->CompileToProg(0);
  EXPECT_TRUE(prog != NULL);

  EXPECT_EQ(prog->bytemap_range(), arraysize(utf8ranges));
  for (int i = 0; i < arraysize(utf8ranges); i++) {
    const UTF8ByteRange& row = utf8ranges[i];
    for (int j = row.lo; j <= row.hi; j++) {
      EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j;
    }
  }

  delete prog;
  re->Decref();
}
|
||||
|
||||
} // namespace re2
|
348
third_party/re2/re2/testing/dfa_test.cc
vendored
348
third_party/re2/re2/testing/dfa_test.cc
vendored
@ -1,348 +0,0 @@
|
||||
// Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#include "util/thread.h"
|
||||
#include "util/test.h"
|
||||
#include "re2/prog.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/regexp.h"
|
||||
#include "re2/testing/regexp_generator.h"
|
||||
#include "re2/testing/string_generator.h"
|
||||
|
||||
static const bool UsingMallocCounter = false;
|
||||
|
||||
DECLARE_bool(re2_dfa_bail_when_slow);
|
||||
|
||||
DEFINE_int32(size, 8, "log2(number of DFA nodes)");
|
||||
DEFINE_int32(repeat, 2, "Repetition count.");
|
||||
DEFINE_int32(threads, 4, "number of threads");
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Check that multithreaded access to DFA class works.
|
||||
|
||||
// Helper thread: builds entire DFA for prog.
|
||||
class BuildThread : public Thread {
|
||||
public:
|
||||
BuildThread(Prog* prog) : prog_(prog) {}
|
||||
virtual void Run() {
|
||||
CHECK(prog_->BuildEntireDFA(Prog::kFirstMatch));
|
||||
}
|
||||
|
||||
private:
|
||||
Prog* prog_;
|
||||
};
|
||||
|
||||
// Builds the entire DFA of one program first from a single helper thread
// and then, FLAGS_repeat times, from FLAGS_threads helper threads at once.
TEST(Multithreaded, BuildEntireDFA) {
  // Create regexp with 2^FLAGS_size states in DFA.
  string pattern = "a";
  for (int i = 0; i < FLAGS_size; i++)
    pattern += "[ab]";
  pattern += "b";

  // Check that single-threaded code works.
  {
    Regexp* re = Regexp::Parse(pattern, Regexp::LikePerl, NULL);
    CHECK(re);
    Prog* prog = re->CompileToProg(0);
    CHECK(prog);

    BuildThread* worker = new BuildThread(prog);
    worker->SetJoinable(true);
    worker->Start();
    worker->Join();
    delete worker;

    delete prog;
    re->Decref();
  }

  // Build the DFA simultaneously in a bunch of threads.
  for (int round = 0; round < FLAGS_repeat; round++) {
    Regexp* re = Regexp::Parse(pattern, Regexp::LikePerl, NULL);
    CHECK(re);
    Prog* prog = re->CompileToProg(0);
    CHECK(prog);

    // Create all workers before starting any of them.
    vector<BuildThread*> workers;
    for (int j = 0; j < FLAGS_threads; j++) {
      BuildThread* worker = new BuildThread(prog);
      worker->SetJoinable(true);
      workers.push_back(worker);
    }
    for (size_t j = 0; j < workers.size(); j++)
      workers[j]->Start();
    for (size_t j = 0; j < workers.size(); j++) {
      workers[j]->Join();
      delete workers[j];
    }

    // One more compile, to make sure everything is okay.
    prog->BuildEntireDFA(Prog::kFirstMatch);

    delete prog;
    re->Decref();
  }
}
|
||||
|
||||
// Check that DFA size requirements are followed.
|
||||
// BuildEntireDFA will, like SearchDFA, stop building out
|
||||
// the DFA once the memory limits are reached.
|
||||
// Compiles one regexp under a series of memory limits (2^17 .. 2^23) and
// builds both the first-match and the longest-match DFA for each. The
// heap usage comparison is only performed when UsingMallocCounter is set.
TEST(SingleThreaded, BuildEntireDFA) {
  // Create regexp with 2^30 states in DFA.
  string pattern = "a";
  for (int i = 0; i < 30; i++)
    pattern += "[ab]";
  pattern += "b";

  Regexp* re = Regexp::Parse(pattern, Regexp::LikePerl, NULL);
  CHECK(re);

  const int max = 24;
  for (int shift = 17; shift < max; shift++) {
    int64 limit = 1<<shift;
    int64 usage;
    {
      // Heap growth is measured across compile + both DFA builds.
      testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
      Prog* prog = re->CompileToProg(limit);
      CHECK(prog);
      prog->BuildEntireDFA(Prog::kFirstMatch);
      prog->BuildEntireDFA(Prog::kLongestMatch);
      usage = m.HeapGrowth();
      delete prog;
    }

    if (UsingMallocCounter) {
      // Tolerate +/- 10%.
      CHECK_GT(usage, limit*9/10);
      CHECK_LT(usage, limit*11/10);
    }
  }
  re->Decref();
}
|
||||
|
||||
// Generates and returns a string over binary alphabet {0,1} that contains
|
||||
// all possible binary sequences of length n as substrings.  The obvious
|
||||
// brute force method would generate a string of length n * 2^n, but this
|
||||
// generates a string of length n + 2^n - 1 called a De Bruijn cycle.
|
||||
// See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
|
||||
// Such a string is useful for testing a DFA. If you have a DFA
|
||||
// where distinct last n bytes implies distinct states, then running on a
|
||||
// DeBruijn string causes the DFA to need to create a new state at every
|
||||
// position in the input, never reusing any states until it gets to the
|
||||
// end of the string. This is the worst possible case for DFA execution.
|
||||
// Returns a string of '0' and '1' characters of length n + 2^n - 1:
// n-1 leading zeros followed by one character per n-bit window value.
// Requires 0 < n < number of bits in an int.
static string DeBruijnString(int n) {
  CHECK_LT(n, static_cast<int>(8*sizeof(int)));
  CHECK_GT(n, 0);

  const int windows = 1<<n;

  // did[v] records whether the n-bit window value v has been produced.
  vector<bool> did(windows, false);

  // Start with n-1 zeros so the first appended character completes the
  // first n-character window.
  string out(n-1, '0');

  int bits = 0;                  // value of the most recent n characters
  const int mask = windows - 1;
  for (int step = 0; step < windows; step++) {
    bits = (bits << 1) & mask;
    // Prefer appending a 1; fall back to 0 if that window was already used.
    if (did[bits|1]) {
      out.append("0");
    } else {
      bits |= 1;
      out.append("1");
    }
    CHECK(!did[bits]);
    did[bits] = true;
  }
  return out;
}
|
||||
|
||||
// Test that the DFA gets the right result even if it runs
|
||||
// out of memory during a search. The regular expression
|
||||
// 0[01]{n}$ matches a binary string of 0s and 1s only if
|
||||
// the (n+1)th-to-last character is a 0. Matching this in
|
||||
// a single forward pass (as done by the DFA) requires
|
||||
// keeping one bit for each of the last n+1 characters
|
||||
// (whether each was a 0), or 2^(n+1) possible states.
|
||||
// If we run this regexp to search in a string that contains
|
||||
// every possible n-character binary string as a substring,
|
||||
// then it will have to run through at least 2^n states.
|
||||
// States are big data structures -- certainly more than 1 byte --
|
||||
// so if the DFA can search correctly while staying within a
|
||||
// 2^n byte limit, it must be handling out-of-memory conditions
|
||||
// gracefully.
|
||||
// Searches a worst-case input repeatedly with a DFA memory budget of 2^n
// bytes and requires every search to finish without failure and with the
// right answer. Heap usage is compared against the budget only when
// UsingMallocCounter is set.
TEST(SingleThreaded, SearchDFA) {
  // Choice of n is mostly arbitrary, except that:
  //   * making n too big makes the test run for too long.
  //   * making n too small makes the DFA refuse to run,
  //     because it has so little memory compared to the program size.
  // Empirically, n = 18 is a good compromise between the two.
  const int n = 18;

  Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
                             Regexp::LikePerl, NULL);
  CHECK(re);

  // The De Bruijn string for n ends with a 1 followed by n 0s in a row,
  // which is not a match for 0[01]{n}$.  Adding one more 0 is a match.
  string no_match = DeBruijnString(n);
  string match = no_match + "0";

  // The De Bruijn string is the worst case input for this regexp.
  // By default, the DFA will notice that it is flushing its cache
  // too frequently and will bail out early, so that RE2 can use the
  // NFA implementation instead.  (The DFA loses its speed advantage
  // if it can't get a good cache hit rate.)
  // Tell the DFA to trudge along instead.
  FLAGS_re2_dfa_bail_when_slow = false;

  int64 usage;
  int64 peak_usage;
  {
    testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
    Prog* prog = re->CompileToProg(1<<n);
    CHECK(prog);
    for (int i = 0; i < 10; i++) {
      bool matched, failed = false;
      matched = prog->SearchDFA(match, NULL,
                                Prog::kUnanchored, Prog::kFirstMatch,
                                NULL, &failed, NULL);
      CHECK(!failed);
      CHECK(matched);
      matched = prog->SearchDFA(no_match, NULL,
                                Prog::kUnanchored, Prog::kFirstMatch,
                                NULL, &failed, NULL);
      CHECK(!failed);
      CHECK(!matched);
    }
    usage = m.HeapGrowth();
    peak_usage = m.PeakHeapGrowth();
    delete prog;
  }

  // The regexp is not used past this point. Release it before the early
  // return below; previously the Decref sat after that return, so the
  // regexp was leaked whenever the malloc counter was disabled.
  re->Decref();

  if (!UsingMallocCounter)
    return;
  CHECK_LT(usage, 1<<n);
  CHECK_LT(peak_usage, 1<<n);
}
|
||||
|
||||
// Helper thread: searches for match, which should match,
|
||||
// and no_match, which should not.
|
||||
class SearchThread : public Thread {
|
||||
public:
|
||||
SearchThread(Prog* prog, const StringPiece& match,
|
||||
const StringPiece& no_match)
|
||||
: prog_(prog), match_(match), no_match_(no_match) {}
|
||||
|
||||
virtual void Run() {
|
||||
for (int i = 0; i < 2; i++) {
|
||||
bool matched, failed = false;
|
||||
matched = prog_->SearchDFA(match_, NULL,
|
||||
Prog::kUnanchored, Prog::kFirstMatch,
|
||||
NULL, &failed, NULL);
|
||||
CHECK(!failed);
|
||||
CHECK(matched);
|
||||
matched = prog_->SearchDFA(no_match_, NULL,
|
||||
Prog::kUnanchored, Prog::kFirstMatch,
|
||||
NULL, &failed, NULL);
|
||||
CHECK(!failed);
|
||||
CHECK(!matched);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
Prog* prog_;
|
||||
StringPiece match_;
|
||||
StringPiece no_match_;
|
||||
};
|
||||
|
||||
// Runs the SearchThread workload first from one helper thread and then,
// FLAGS_repeat times, from FLAGS_threads helper threads sharing one program.
TEST(Multithreaded, SearchDFA) {
  // Same as single-threaded test above.
  const int n = 18;
  Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
                             Regexp::LikePerl, NULL);
  CHECK(re);
  string no_match = DeBruijnString(n);
  string match = no_match + "0";
  FLAGS_re2_dfa_bail_when_slow = false;

  // Check that single-threaded code works.
  {
    Prog* prog = re->CompileToProg(1<<n);
    CHECK(prog);

    SearchThread* worker = new SearchThread(prog, match, no_match);
    worker->SetJoinable(true);
    worker->Start();
    worker->Join();
    delete worker;

    delete prog;
  }

  // Run the search simultaneously in a bunch of threads.
  // Reuse same flags for Multithreaded.BuildDFA above.
  for (int round = 0; round < FLAGS_repeat; round++) {
    Prog* prog = re->CompileToProg(1<<n);
    CHECK(prog);

    // Create all workers before starting any of them.
    vector<SearchThread*> workers;
    for (int j = 0; j < FLAGS_threads; j++) {
      SearchThread* worker = new SearchThread(prog, match, no_match);
      worker->SetJoinable(true);
      workers.push_back(worker);
    }
    for (size_t j = 0; j < workers.size(); j++)
      workers[j]->Start();
    for (size_t j = 0; j < workers.size(); j++) {
      workers[j]->Join();
      delete workers[j];
    }

    delete prog;
  }
  re->Decref();
}
|
||||
|
||||
// One reverse-DFA case.
struct ReverseTest {
  const char *regexp;  // pattern to compile into a reverse program
  const char *text;    // text to search
  bool match;          // whether the search is expected to match
};

// Test that reverse DFA handles anchored/unanchored correctly.
// It's in the DFA interface but not used by RE2.
ReverseTest reverse_tests[] = {
  // Expected to match.
  { "\\A(a|b)", "abc", true },
  { "(a|b)\\z", "cba", true },

  // Expected not to match.
  { "\\A(a|b)", "cba", false },
  { "(a|b)\\z", "abc", false },
};
|
||||
|
||||
// Compiles each pattern in reverse_tests into a reverse program, runs an
// unanchored first-match DFA search over the text and compares the
// outcome with the expected one. Problems are logged and counted; the
// test passes only if the count stays at zero.
TEST(DFA, ReverseMatch) {
  int nfail = 0;
  for (int i = 0; i < arraysize(reverse_tests); i++) {
    const ReverseTest& t = reverse_tests[i];
    Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
    CHECK(re);
    Prog *prog = re->CompileToReverseProg(0);
    CHECK(prog);
    bool failed = false;
    bool matched = prog->SearchDFA(t.text, NULL, Prog::kUnanchored, Prog::kFirstMatch, NULL, &failed, NULL);
    if (failed) {
      // The other DFA tests in this file treat a set `failed` flag as an
      // error; here it used to be ignored, so a search that gave up could
      // be mistaken for a legitimate "no match".
      LOG(ERROR) << t.regexp << " on " << t.text << ": DFA search failed";
      nfail++;
    } else if (matched != t.match) {
      LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
      nfail++;
    }
    delete prog;
    re->Decref();
  }
  EXPECT_EQ(nfail, 0);
}
|
||||
|
||||
} // namespace re2
|
166
third_party/re2/re2/testing/dump.cc
vendored
166
third_party/re2/re2/testing/dump.cc
vendored
@ -1,166 +0,0 @@
|
||||
// Copyright 2006 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Dump the regexp into a string showing structure.
|
||||
// Tested by parse_unittest.cc
|
||||
|
||||
// This function traverses the regexp recursively,
|
||||
// meaning that on inputs like Regexp::Simplify of
|
||||
// a{100}{100}{100}{100}{100}{100}{100}{100}{100}{100},
|
||||
// it takes time and space exponential in the size of the
|
||||
// original regular expression. It can also use stack space
|
||||
// linear in the size of the regular expression for inputs
|
||||
// like ((((((((((((((((a*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*)*.
|
||||
// IT IS NOT SAFE TO CALL FROM PRODUCTION CODE.
|
||||
// As a result, Dump is provided only in the testing
|
||||
// library (see BUILD).
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "util/test.h"
|
||||
#include "re2/stringpiece.h"
|
||||
#include "re2/regexp.h"
|
||||
|
||||
// Cause a link error if this file is used outside of testing.
|
||||
DECLARE_string(test_tmpdir);
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Short names used in dumps, indexed by the value of Regexp::op().
static const char* kOpcodeNames[] = {
  "bad", "no", "emp", "lit", "str",
  "cat", "alt",
  "star", "plus", "que", "rep",
  "cap",
  "dot", "byte",
  "bol", "eol",
  "wb",   // kRegexpWordBoundary
  "nwb",  // kRegexpNoWordBoundary
  "bot", "eot",
  "cc",
  "match",
};
|
||||
|
||||
// Create string representation of regexp with explicit structure.
|
||||
// Nothing pretty, just for testing.
|
||||
static void DumpRegexpAppending(Regexp* re, string* s) {
|
||||
if (re->op() < 0 || re->op() >= arraysize(kOpcodeNames)) {
|
||||
StringAppendF(s, "op%d", re->op());
|
||||
} else {
|
||||
switch (re->op()) {
|
||||
default:
|
||||
break;
|
||||
case kRegexpStar:
|
||||
case kRegexpPlus:
|
||||
case kRegexpQuest:
|
||||
case kRegexpRepeat:
|
||||
if (re->parse_flags() & Regexp::NonGreedy)
|
||||
s->append("n");
|
||||
break;
|
||||
}
|
||||
s->append(kOpcodeNames[re->op()]);
|
||||
if (re->op() == kRegexpLiteral && (re->parse_flags() & Regexp::FoldCase)) {
|
||||
Rune r = re->rune();
|
||||
if ('a' <= r && r <= 'z')
|
||||
s->append("fold");
|
||||
}
|
||||
if (re->op() == kRegexpLiteralString && (re->parse_flags() & Regexp::FoldCase)) {
|
||||
for (int i = 0; i < re->nrunes(); i++) {
|
||||
Rune r = re->runes()[i];
|
||||
if ('a' <= r && r <= 'z') {
|
||||
s->append("fold");
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
s->append("{");
|
||||
switch (re->op()) {
|
||||
default:
|
||||
break;
|
||||
case kRegexpEndText:
|
||||
if (!(re->parse_flags() & Regexp::WasDollar)) {
|
||||
s->append("\\z");
|
||||
}
|
||||
break;
|
||||
case kRegexpLiteral: {
|
||||
Rune r = re->rune();
|
||||
char buf[UTFmax+1];
|
||||
buf[runetochar(buf, &r)] = 0;
|
||||
s->append(buf);
|
||||
break;
|
||||
}
|
||||
case kRegexpLiteralString:
|
||||
for (int i = 0; i < re->nrunes(); i++) {
|
||||
Rune r = re->runes()[i];
|
||||
char buf[UTFmax+1];
|
||||
buf[runetochar(buf, &r)] = 0;
|
||||
s->append(buf);
|
||||
}
|
||||
break;
|
||||
case kRegexpConcat:
|
||||
case kRegexpAlternate:
|
||||
for (int i = 0; i < re->nsub(); i++)
|
||||
DumpRegexpAppending(re->sub()[i], s);
|
||||
break;
|
||||
case kRegexpStar:
|
||||
case kRegexpPlus:
|
||||
case kRegexpQuest:
|
||||
DumpRegexpAppending(re->sub()[0], s);
|
||||
break;
|
||||
case kRegexpCapture:
|
||||
if (re->cap() == 0)
|
||||
LOG(DFATAL) << "kRegexpCapture cap() == 0";
|
||||
if (re->name()) {
|
||||
s->append(*re->name());
|
||||
s->append(":");
|
||||
}
|
||||
DumpRegexpAppending(re->sub()[0], s);
|
||||
break;
|
||||
case kRegexpRepeat:
|
||||
s->append(StringPrintf("%d,%d ", re->min(), re->max()));
|
||||
DumpRegexpAppending(re->sub()[0], s);
|
||||
break;
|
||||
case kRegexpCharClass: {
|
||||
string sep;
|
||||
for (CharClass::iterator it = re->cc()->begin();
|
||||
it != re->cc()->end(); ++it) {
|
||||
RuneRange rr = *it;
|
||||
s->append(sep);
|
||||
if (rr.lo == rr.hi)
|
||||
s->append(StringPrintf("%#x", rr.lo));
|
||||
else
|
||||
s->append(StringPrintf("%#x-%#x", rr.lo, rr.hi));
|
||||
sep = " ";
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
s->append("}");
|
||||
}
|
||||
|
||||
// Returns the structural dump of this regexp.
// If FLAGS_test_tmpdir is empty an error is logged and the empty string
// is returned instead.
string Regexp::Dump() {
  string out;

  // Make sure being called from a unit test.
  if (!FLAGS_test_tmpdir.empty()) {
    DumpRegexpAppending(this, &out);
  } else {
    LOG(ERROR) << "Cannot use except for testing.";
  }
  return out;
}
|
||||
|
||||
} // namespace re2
|
42
third_party/re2/re2/testing/exhaustive1_test.cc
vendored
42
third_party/re2/re2/testing/exhaustive1_test.cc
vendored
@ -1,42 +0,0 @@
|
||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Exhaustive testing of regular expression matching.
|
||||
|
||||
#include "util/test.h"
|
||||
#include "re2/testing/exhaustive_tester.h"
|
||||
|
||||
DECLARE_string(regexp_engines);
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Test simple repetition operators
|
||||
// Runs two exhaustive tests over the atoms "a", "b", "c", "." combined
// with the repetition operators listed in ops.
TEST(Repetition, Simple) {
  vector<string> ops = Split(" ",
    "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} "
    "%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} "
    "%s* %s+ %s? %s*? %s+? %s??");

  const char* const kAtoms = "abc.";
  const char* const kWrapper = "(?:%s)";

  ExhaustiveTest(3, 2, Explode(kAtoms), ops,
                 6, Explode("ab"), kWrapper, "");
  ExhaustiveTest(3, 2, Explode(kAtoms), ops,
                 40, Explode("a"), kWrapper, "");
}
|
||||
|
||||
// Test capturing parens -- (a) -- inside repetition operators
|
||||
// Runs exhaustive tests with the atoms "a", "(a)" (and "b" in the first
// run) combined with the repetition operators listed in ops.
TEST(Repetition, Capturing) {
  vector<string> ops = Split(" ",
    "%s{0} %s{0,} %s{1} %s{1,} %s{0,1} %s{0,2} "
    "%s{1,2} %s{2} %s{2,} %s{3,4} %s{4,5} "
    "%s* %s+ %s? %s*? %s+? %s??");
  ExhaustiveTest(3, 2, Split(" ", "a (a) b"), ops,
                 7, Explode("ab"), "(?:%s)", "");

  // This would be a great test, but it runs forever when PCRE is enabled.
  //
  // The original check was strstr("PCRE", flag), i.e. it looked for the
  // whole flag value inside the literal "PCRE" (haystack and needle
  // swapped). A value such as "PCRE,DFA" therefore was not recognised as
  // enabling PCRE and the long test ran anyway. Search the flag value for
  // "PCRE" instead. An empty flag value keeps its previous effect of
  // skipping the test (strstr with an empty needle always succeeded).
  // NOTE(review): presumably an empty value selects every engine,
  // including PCRE -- confirm against the tester.
  const string& engines = FLAGS_regexp_engines;
  if (!engines.empty() && engines.find("PCRE") == string::npos)
    ExhaustiveTest(4, 3, Split(" ", "a (a)"), ops,
                   100, Explode("a"), "(?:%s)", "");
}
|
||||
|
||||
} // namespace re2
|
||||
|
70
third_party/re2/re2/testing/exhaustive2_test.cc
vendored
70
third_party/re2/re2/testing/exhaustive2_test.cc
vendored
@ -1,70 +0,0 @@
|
||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Exhaustive testing of regular expression matching.
|
||||
|
||||
#include "util/test.h"
|
||||
#include "re2/re2.h"
|
||||
#include "re2/testing/exhaustive_tester.h"
|
||||
|
||||
DECLARE_string(regexp_engines);
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Test empty string matches (aka "(?:)")
|
||||
// Exhaustive test built from the atoms "(?:)" and "a" with the egrep
// operator set.
TEST(EmptyString, Exhaustive) {
  const char* const kAtoms = "(?:) a";
  const char* const kAlphabet = "ab";
  ExhaustiveTest(2, 2, Split(" ", kAtoms),
                 RegexpGenerator::EgrepOps(),
                 5, Split("", kAlphabet), "", "");
}
|
||||
|
||||
// Test escaped versions of regexp syntax.
|
||||
// Exhaustive test in which every regexp atom is a backslash-escaped
// punctuation character and the text alphabet is the same characters
// unescaped.
TEST(Punctuation, Literals) {
  vector<string> alphabet = Explode("()*+?{}[]\\^$.");

  // Build the escaped form of each alphabet entry.
  vector<string> escaped;
  for (size_t i = 0; i < alphabet.size(); i++)
    escaped.push_back("\\" + alphabet[i]);

  ExhaustiveTest(1, 1, escaped, RegexpGenerator::EgrepOps(),
                 2, alphabet, "", "");
}
|
||||
|
||||
// Test ^ $ . \A \z in presence of line endings.
|
||||
// Have to wrap the empty-width ones in (?:) so that
|
||||
// they can be repeated -- PCRE rejects ^* but allows (?:^)*
|
||||
TEST(LineEnds, Exhaustive) {
  // Zero-width assertions are wrapped in (?:...) so that they can take a
  // repetition operator.
  const vector<string> atoms =
      Split(" ", "(?:^) (?:$) . a \\n (?:\\A) (?:\\z)");
  // The input alphabet includes a real newline.
  const vector<string> input_alphabet = Explode("ab\n");

  // maxatoms = 2, maxops = 2, maxstrlen = 4.
  ExhaustiveTest(2, 2, atoms, RegexpGenerator::EgrepOps(),
                 4, input_alphabet, "", "");
}
|
||||
|
||||
// Test what does and does not match \n.
|
||||
// This would be a good test, except that PCRE seems to have a bug:
|
||||
// in single-byte character set mode (the default),
|
||||
// [^a] matches \n, but in UTF-8 mode it does not.
|
||||
// So when we run the test, the tester complains that
|
||||
// we don't agree with PCRE, but it's PCRE that is at fault.
|
||||
// For what it's worth, Perl gets this right (matches
|
||||
// regardless of whether UTF-8 input is selected):
|
||||
//
|
||||
// #!/usr/bin/perl
|
||||
// use POSIX qw(locale_h);
|
||||
// print "matches in latin1\n" if "\n" =~ /[^a]/;
|
||||
// setlocale("en_US.utf8");
|
||||
// print "matches in utf8\n" if "\n" =~ /[^a]/;
|
||||
//
|
||||
// The rule chosen for RE2 is that by default, like Perl,
|
||||
// dot does not match \n but negated character classes [^a] do.
|
||||
// (?s) will allow dot to match \n; there is no way in RE2
|
||||
// to stop [^a] from matching \n, though the underlying library
|
||||
// provides a mechanism, and RE2 could add new syntax if needed.
|
||||
//
|
||||
// TEST(Newlines, Exhaustive) {
|
||||
// vector<string> empty_vector;
|
||||
// ExhaustiveTest(1, 1, Split(" ", "\\n . a [^a]"),
|
||||
// RegexpGenerator::EgrepOps(),
|
||||
// 4, Explode("a\n"), "", "");
|
||||
// }
|
||||
|
||||
} // namespace re2
|
||||
|
94
third_party/re2/re2/testing/exhaustive3_test.cc
vendored
94
third_party/re2/re2/testing/exhaustive3_test.cc
vendored
@ -1,94 +0,0 @@
|
||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Exhaustive testing of regular expression matching.
|
||||
|
||||
#include "util/test.h"
|
||||
#include "re2/testing/exhaustive_tester.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// Test simple character classes by themselves.
|
||||
TEST(CharacterClasses, Exhaustive) {
  // Space-separated list of the atoms under test: bracket expressions
  // plus the plain atoms "a", "b" and ".".
  static const char kAtoms[] =
      "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .";
  const vector<string> atoms = Split(" ", kAtoms);

  // maxatoms = 2, maxops = 1, maxstrlen = 5; atoms are used unwrapped.
  ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(),
                 5, Explode("ab"), "", "");
}
|
||||
|
||||
// Test simple character classes inside a___b (for example, a[a]b).
|
||||
TEST(CharacterClasses, ExhaustiveAB) {
  // Same atom list as the unwrapped CharacterClasses test.
  static const char kAtoms[] =
      "[a] [b] [ab] [^bc] [b-d] [^b-d] []a] [-a] [a-] [^-a] [a-b-c] a b .";
  const vector<string> atoms = Split(" ", kAtoms);

  // The "a%sb" wrapper places each generated regexp between 'a' and 'b'.
  ExhaustiveTest(2, 1, atoms, RegexpGenerator::EgrepOps(),
                 5, Explode("ab"), "a%sb", "");
}
|
||||
|
||||
// Returns UTF8 for Rune r
|
||||
static string UTF8(Rune r) {
  // Room for UTFmax encoded bytes plus the terminating NUL.
  char encoded[UTFmax + 1];
  const int len = runetochar(encoded, &r);
  encoded[len] = '\0';
  return string(encoded);
}
|
||||
|
||||
// Returns a vector of "interesting" UTF8 characters.
|
||||
// Unicode is now too big to just return all of them,
|
||||
// so InterestingUTF8 returns a set likely to be good test cases.
|
||||
static const vector<string>& InterestingUTF8() {
  // Built on the first call and returned unchanged on every later call.
  static bool initialized;
  static vector<string> chars;

  if (!initialized) {
    initialized = true;

    // Code points 1..255: all the Latin1 equivalents.
    for (int r = 1; r < 256; r++)
      chars.push_back(UTF8(r));

    // Code points on either side of each power of two, because they span
    // byte sequence lengths.  256 itself only gets the eight codes from
    // 256 upward; the codes just below it were added by the loop above.
    for (int delta = 0; delta < 8; delta++)
      chars.push_back(UTF8(256 + delta));
    for (int base = 512; base < Runemax; base <<= 1) {
      for (int delta = -8; delta < 8; delta++)
        chars.push_back(UTF8(base + delta));
    }

    // The last nine code points, ending with Runemax itself.
    for (int delta = -8; delta <= 0; delta++)
      chars.push_back(UTF8(Runemax + delta));
  }

  return chars;
}
|
||||
|
||||
// Test interesting UTF-8 characters against character classes.
|
||||
TEST(InterestingUTF8, SingleOps) {
  // Space-separated atoms: dot, anchors, escapes, Perl classes,
  // POSIX classes and a few combined bracket expressions.
  static const char kAtoms[] =
      ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
      "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
      "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
      "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]";
  const vector<string> atoms = Split(" ", kAtoms);
  const vector<string> no_ops;  // deliberately empty: no operators

  // maxatoms = 1, maxops = 0, maxstrlen = 1.
  ExhaustiveTest(1, 0, atoms, no_ops,
                 1, InterestingUTF8(), "", "");
}
|
||||
|
||||
// Test interesting UTF-8 characters against character classes,
|
||||
// but wrap everything inside AB.
|
||||
TEST(InterestingUTF8, AB) {
  // Same atom list as the SingleOps test.
  static const char kAtoms[] =
      ". ^ $ \\a \\f \\n \\r \\t \\v \\d \\D \\s \\S \\w \\W \\b \\B "
      "[[:alnum:]] [[:alpha:]] [[:blank:]] [[:cntrl:]] [[:digit:]] "
      "[[:graph:]] [[:lower:]] [[:print:]] [[:punct:]] [[:space:]] "
      "[[:upper:]] [[:xdigit:]] [\\s\\S] [\\d\\D] [^\\w\\W] [^\\d\\D]";
  const vector<string> atoms = Split(" ", kAtoms);
  const vector<string> no_ops;  // deliberately empty: no operators

  // Each alphabet entry is an interesting character between 'a' and 'b',
  // mirroring the "a%sb" wrapper applied to the regexps.
  const vector<string>& chars = InterestingUTF8();
  vector<string> alpha;
  for (size_t i = 0; i < chars.size(); i++)
    alpha.push_back("a" + chars[i] + "b");

  ExhaustiveTest(1, 0, atoms, no_ops,
                 1, alpha, "a%sb", "");
}
|
||||
|
||||
} // namespace re2
|
||||
|
38
third_party/re2/re2/testing/exhaustive_test.cc
vendored
38
third_party/re2/re2/testing/exhaustive_test.cc
vendored
@ -1,38 +0,0 @@
|
||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Exhaustive testing of regular expression matching.
|
||||
|
||||
#include "util/test.h"
|
||||
#include "re2/testing/exhaustive_tester.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
DECLARE_string(regexp_engines);
|
||||
|
||||
// Test very simple expressions.
|
||||
TEST(EgrepLiterals, Lowercase) {
  const string regexp_alphabet = "abc.";
  const string input_alphabet = "abc";
  // maxatoms = 3, maxops = 2, maxstrlen = 3, no wrapper.
  EgrepTest(3, 2, regexp_alphabet, 3, input_alphabet, "");
}
|
||||
|
||||
// Test mixed-case expressions.
|
||||
TEST(EgrepLiterals, MixedCase) {
  const string regexp_alphabet = "AaBb.";
  const string input_alphabet = "AaBb";
  // maxatoms = 3, maxops = 2, maxstrlen = 2, no wrapper.
  EgrepTest(3, 2, regexp_alphabet, 2, input_alphabet, "");
}
|
||||
|
||||
// Test mixed-case in case-insensitive mode.
|
||||
TEST(EgrepLiterals, FoldCase) {
  const string regexp_alphabet = "abAB.";
  // The punctuation in the input alphabet surrounds A-Z and a-z in the
  // ASCII table, which probes for bugs in the DFA's bytemap range code.
  const string input_alphabet = "aBc@_~";
  // The (?i:...) wrapper turns on case-insensitive matching.
  EgrepTest(3, 2, regexp_alphabet, 2, input_alphabet, "(?i:%s)");
}
|
||||
|
||||
// Test simple expressions against input containing a multi-byte UTF-8 sequence.
|
||||
TEST(EgrepLiterals, UTF8) {
  const string regexp_alphabet = "ab.";
  // 'a' followed by the three bytes E2 98 BA.
  const string input_alphabet = "a\xE2\x98\xBA";
  // maxatoms = 3, maxops = 2, maxstrlen = 4, no wrapper.
  EgrepTest(3, 2, regexp_alphabet, 4, input_alphabet, "");
}
|
||||
|
||||
} // namespace re2
|
||||
|
188
third_party/re2/re2/testing/exhaustive_tester.cc
vendored
188
third_party/re2/re2/testing/exhaustive_tester.cc
vendored
@ -1,188 +0,0 @@
|
||||
// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// Exhaustive testing of regular expression matching.
|
||||
|
||||
// Each test picks an alphabet (e.g., "abc"), a maximum string length,
|
||||
// a maximum regular expression length, and a maximum number of letters
|
||||
// that can appear in the regular expression. Given these parameters,
|
||||
// it tries every possible regular expression and string, verifying that
|
||||
// the NFA, DFA, and a trivial backtracking implementation agree about
|
||||
// the location of the match.
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifndef LOGGING
|
||||
#define LOGGING 0
|
||||
#endif
|
||||
|
||||
#include "util/test.h"
|
||||
#include "re2/testing/exhaustive_tester.h"
|
||||
#include "re2/testing/tester.h"
|
||||
|
||||
DEFINE_bool(show_regexps, false, "show regexps during testing");
|
||||
|
||||
DEFINE_int32(max_bad_regexp_inputs, 1,
|
||||
"Stop testing a regular expression after finding this many "
|
||||
"strings that break it.");
|
||||
|
||||
// Compiled in debug mode, the usual tests run for over an hour.
|
||||
// Have to cut it down to make the unit test machines happy.
|
||||
DEFINE_bool(quick_debug_mode, true, "Run fewer tests in debug mode.");
|
||||
|
||||
namespace re2 {
|
||||
|
||||
static char* escape(const StringPiece& sp) {
|
||||
static char buf[512];
|
||||
char* p = buf;
|
||||
*p++ = '\"';
|
||||
for (int i = 0; i < sp.size(); i++) {
|
||||
if(p+5 >= buf+sizeof buf)
|
||||
LOG(FATAL) << "ExhaustiveTester escape: too long";
|
||||
if(sp[i] == '\\' || sp[i] == '\"') {
|
||||
*p++ = '\\';
|
||||
*p++ = sp[i];
|
||||
} else if(sp[i] == '\n') {
|
||||
*p++ = '\\';
|
||||
*p++ = 'n';
|
||||
} else {
|
||||
*p++ = sp[i];
|
||||
}
|
||||
}
|
||||
*p++ = '\"';
|
||||
*p = '\0';
|
||||
return buf;
|
||||
}
|
||||
|
||||
static void PrintResult(const RE2& re, const StringPiece& input, RE2::Anchor anchor, StringPiece *m, int n) {
|
||||
if (!re.Match(input, 0, input.size(), anchor, m, n)) {
|
||||
printf("-");
|
||||
return;
|
||||
}
|
||||
for (int i = 0; i < n; i++) {
|
||||
if (i > 0)
|
||||
printf(" ");
|
||||
if (m[i].begin() == NULL)
|
||||
printf("-");
|
||||
else
|
||||
printf("%d-%d", static_cast<int>(m[i].begin() - input.begin()), static_cast<int>(m[i].end() - input.begin()));
|
||||
}
|
||||
}
|
||||
|
||||
// Processes a single generated regexp.
|
||||
// Compiles it using Regexp interface and PCRE, and then
|
||||
// checks that NFA, DFA, and PCRE all return the same results.
|
||||
void ExhaustiveTester::HandleRegexp(const string& const_regexp) {
|
||||
regexps_++;
|
||||
string regexp = const_regexp;
|
||||
if (!topwrapper_.empty())
|
||||
regexp = StringPrintf(topwrapper_.c_str(), regexp.c_str());
|
||||
|
||||
if (FLAGS_show_regexps) {
|
||||
printf("\r%s", regexp.c_str());
|
||||
fflush(stdout);
|
||||
}
|
||||
|
||||
if (LOGGING) {
|
||||
// Write out test cases and answers for use in testing
|
||||
// other implementations, such as Go's regexp package.
|
||||
if (randomstrings_)
|
||||
LOG(ERROR) << "Cannot log with random strings.";
|
||||
if (regexps_ == 1) { // first
|
||||
printf("strings\n");
|
||||
strgen_.Reset();
|
||||
while (strgen_.HasNext())
|
||||
printf("%s\n", escape(strgen_.Next()));
|
||||
printf("regexps\n");
|
||||
}
|
||||
printf("%s\n", escape(regexp));
|
||||
|
||||
RE2 re(regexp);
|
||||
RE2::Options longest;
|
||||
longest.set_longest_match(true);
|
||||
RE2 relongest(regexp, longest);
|
||||
int ngroup = re.NumberOfCapturingGroups()+1;
|
||||
StringPiece* group = new StringPiece[ngroup];
|
||||
|
||||
strgen_.Reset();
|
||||
while (strgen_.HasNext()) {
|
||||
StringPiece input = strgen_.Next();
|
||||
PrintResult(re, input, RE2::ANCHOR_BOTH, group, ngroup);
|
||||
printf(";");
|
||||
PrintResult(re, input, RE2::UNANCHORED, group, ngroup);
|
||||
printf(";");
|
||||
PrintResult(relongest, input, RE2::ANCHOR_BOTH, group, ngroup);
|
||||
printf(";");
|
||||
PrintResult(relongest, input, RE2::UNANCHORED, group, ngroup);
|
||||
printf("\n");
|
||||
}
|
||||
delete[] group;
|
||||
return;
|
||||
}
|
||||
|
||||
Tester tester(regexp);
|
||||
if (tester.error())
|
||||
return;
|
||||
|
||||
strgen_.Reset();
|
||||
strgen_.GenerateNULL();
|
||||
if (randomstrings_)
|
||||
strgen_.Random(stringseed_, stringcount_);
|
||||
int bad_inputs = 0;
|
||||
while (strgen_.HasNext()) {
|
||||
tests_++;
|
||||
if (!tester.TestInput(strgen_.Next())) {
|
||||
failures_++;
|
||||
if (++bad_inputs >= FLAGS_max_bad_regexp_inputs)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Runs an exhaustive test on the given parameters.
|
||||
void ExhaustiveTest(int maxatoms, int maxops,
|
||||
const vector<string>& alphabet,
|
||||
const vector<string>& ops,
|
||||
int maxstrlen, const vector<string>& stralphabet,
|
||||
const string& wrapper,
|
||||
const string& topwrapper) {
|
||||
if (RE2_DEBUG_MODE && FLAGS_quick_debug_mode) {
|
||||
if (maxatoms > 1)
|
||||
maxatoms--;
|
||||
if (maxops > 1)
|
||||
maxops--;
|
||||
if (maxstrlen > 1)
|
||||
maxstrlen--;
|
||||
}
|
||||
ExhaustiveTester t(maxatoms, maxops, alphabet, ops,
|
||||
maxstrlen, stralphabet, wrapper,
|
||||
topwrapper);
|
||||
t.Generate();
|
||||
if (!LOGGING) {
|
||||
printf("%d regexps, %d tests, %d failures [%d/%d str]\n",
|
||||
t.regexps(), t.tests(), t.failures(), maxstrlen, (int)stralphabet.size());
|
||||
}
|
||||
EXPECT_EQ(0, t.failures());
|
||||
}
|
||||
|
||||
// Runs an exhaustive test using the given parameters and
|
||||
// the basic egrep operators.
|
||||
void EgrepTest(int maxatoms, int maxops, const string& alphabet,
|
||||
int maxstrlen, const string& stralphabet,
|
||||
const string& wrapper) {
|
||||
const char* tops[] = { "", "^(?:%s)", "(?:%s)$", "^(?:%s)$" };
|
||||
|
||||
for (int i = 0; i < arraysize(tops); i++) {
|
||||
ExhaustiveTest(maxatoms, maxops,
|
||||
Split("", alphabet),
|
||||
RegexpGenerator::EgrepOps(),
|
||||
maxstrlen,
|
||||
Split("", stralphabet),
|
||||
wrapper,
|
||||
tops[i]);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace re2
|
95
third_party/re2/re2/testing/exhaustive_tester.h
vendored
95
third_party/re2/re2/testing/exhaustive_tester.h
vendored
@ -1,95 +0,0 @@
|
||||
// Copyright 2009 The RE2 Authors. All Rights Reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
#ifndef RE2_TESTING_EXHAUSTIVE_TESTER_H__
|
||||
#define RE2_TESTING_EXHAUSTIVE_TESTER_H__
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "util/util.h"
|
||||
#include "re2/testing/regexp_generator.h"
|
||||
#include "re2/testing/string_generator.h"
|
||||
|
||||
namespace re2 {
|
||||
|
||||
// RE2_DEBUG_MODE is true in builds without NDEBUG and in builds where one
// of the sanitizer macros below evaluates to nonzero; false otherwise.
#ifndef NDEBUG
// Debug build.
const bool RE2_DEBUG_MODE = true;
#elif ADDRESS_SANITIZER || MEMORY_SANITIZER || THREAD_SANITIZER
// NDEBUG is defined, but a sanitizer is active.
const bool RE2_DEBUG_MODE = true;
#else
// NDEBUG is defined and no sanitizer is active.
const bool RE2_DEBUG_MODE = false;
#endif
|
||||
|
||||
// Exhaustive regular expression test: generate all regexps within parameters,
|
||||
// then generate all strings of a given length over a given alphabet,
|
||||
// then check that NFA, DFA, and PCRE agree about whether each regexp matches
|
||||
// each possible string, and if so, where the match is.
|
||||
//
|
||||
// Can also be used in a "random" mode that generates a given number
|
||||
// of random regexp and strings, allowing testing of larger expressions
|
||||
// and inputs.
|
||||
class ExhaustiveTester : public RegexpGenerator {
|
||||
public:
|
||||
ExhaustiveTester(int maxatoms,
|
||||
int maxops,
|
||||
const vector<string>& alphabet,
|
||||
const vector<string>& ops,
|
||||
int maxstrlen,
|
||||
const vector<string>& stralphabet,
|
||||
const string& wrapper,
|
||||
const string& topwrapper)
|
||||
: RegexpGenerator(maxatoms, maxops, alphabet, ops),
|
||||
strgen_(maxstrlen, stralphabet),
|
||||
wrapper_(wrapper),
|
||||
topwrapper_(topwrapper),
|
||||
regexps_(0), tests_(0), failures_(0),
|
||||
randomstrings_(0), stringseed_(0), stringcount_(0) { }
|
||||
|
||||
int regexps() { return regexps_; }
|
||||
int tests() { return tests_; }
|
||||
int failures() { return failures_; }
|
||||
|
||||
// Needed for RegexpGenerator interface.
|
||||
void HandleRegexp(const string& regexp);
|
||||
|
||||
// Causes testing to generate random input strings.
|
||||
void RandomStrings(int32 seed, int32 count) {
|
||||
randomstrings_ = true;
|
||||
stringseed_ = seed;
|
||||
stringcount_ = count;
|
||||
}
|
||||
|
||||
private:
|
||||
StringGenerator strgen_;
|
||||
string wrapper_; // Regexp wrapper - either empty or has one %s.
|
||||
string topwrapper_; // Regexp top-level wrapper.
|
||||
int regexps_; // Number of HandleRegexp calls
|
||||
int tests_; // Number of regexp tests.
|
||||
int failures_; // Number of tests failed.
|
||||
|
||||
bool randomstrings_; // Whether to use random strings
|
||||
int32 stringseed_; // If so, the seed.
|
||||
int stringcount_; // If so, how many to generate.
|
||||
DISALLOW_COPY_AND_ASSIGN(ExhaustiveTester);
|
||||
};
|
||||
|
||||
// Runs an exhaustive test on the given parameters.
|
||||
void ExhaustiveTest(int maxatoms, int maxops,
|
||||
const vector<string>& alphabet,
|
||||
const vector<string>& ops,
|
||||
int maxstrlen, const vector<string>& stralphabet,
|
||||
const string& wrapper,
|
||||
const string& topwrapper);
|
||||
|
||||
// Runs an exhaustive test using the given parameters and
|
||||
// the basic egrep operators.
|
||||
void EgrepTest(int maxatoms, int maxops, const string& alphabet,
|
||||
int maxstrlen, const string& stralphabet,
|
||||
const string& wrapper);
|
||||
|
||||
} // namespace re2
|
||||
|
||||
#endif // RE2_TESTING_EXHAUSTIVE_TESTER_H__
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user