[legacymetrics] Reconnect to FIDL automatically
Implements automatic FIDL reconnection with exponential backoff. If the client loses connection to the MetricsRecorder FIDL service, it will attempt to reconnect after 1 second. Consecutive disconnect/reconnect cycles without a successful Record() FIDL call will wait for twice as long, up to a maximum of 1 hour. Also continues to record user action events even during service disconnect, up to a maximum of 5000 events before dropping new events. Bug: b/170876057 Test: cr_fuchsia_base_unittests Change-Id: Ica2ed85f619834d6a50b546420e7acea86a9549c Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2547793 Commit-Queue: Akira Baruah <akirabaruah@google.com> Reviewed-by: Kevin Marshall <kmarshall@chromium.org> Cr-Commit-Position: refs/heads/master@{#835942}
This commit is contained in:

committed by
Chromium LUCI CQ

parent
77851b97f1
commit
7b75418bae
@ -133,6 +133,7 @@ test("cr_fuchsia_base_unittests") {
|
||||
"//base/test:run_all_unittests",
|
||||
"//base/test:test_support",
|
||||
"//components/version_info",
|
||||
"//testing/gmock",
|
||||
"//testing/gtest",
|
||||
"//third_party/fuchsia-sdk/sdk/pkg/sys_cpp",
|
||||
"//third_party/fuchsia-sdk/sdk/pkg/sys_inspect_cpp",
|
||||
|
@ -6,6 +6,8 @@
|
||||
|
||||
#include <lib/fit/function.h>
|
||||
#include <lib/sys/cpp/component_context.h>
|
||||
#include <zircon/errors.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
@ -22,6 +24,10 @@ namespace cr_fuchsia {
|
||||
|
||||
constexpr size_t LegacyMetricsClient::kMaxBatchSize;
|
||||
|
||||
constexpr base::TimeDelta LegacyMetricsClient::kInitialReconnectDelay;
|
||||
constexpr base::TimeDelta LegacyMetricsClient::kMaxReconnectDelay;
|
||||
constexpr size_t LegacyMetricsClient::kReconnectBackoffFactor;
|
||||
|
||||
LegacyMetricsClient::LegacyMetricsClient() = default;
|
||||
|
||||
LegacyMetricsClient::~LegacyMetricsClient() {
|
||||
@ -31,18 +37,12 @@ LegacyMetricsClient::~LegacyMetricsClient() {
|
||||
void LegacyMetricsClient::Start(base::TimeDelta report_interval) {
|
||||
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
|
||||
DCHECK_GT(report_interval, base::TimeDelta::FromSeconds(0));
|
||||
DCHECK(!metrics_recorder_) << "Start() called more than once.";
|
||||
|
||||
// Start recording user events.
|
||||
user_events_recorder_ = std::make_unique<LegacyMetricsUserActionRecorder>();
|
||||
|
||||
report_interval_ = report_interval;
|
||||
metrics_recorder_ = base::ComponentContextForProcess()
|
||||
->svc()
|
||||
->Connect<fuchsia::legacymetrics::MetricsRecorder>();
|
||||
metrics_recorder_.set_error_handler(fit::bind_member(
|
||||
this, &LegacyMetricsClient::OnMetricsRecorderDisconnected));
|
||||
metrics_recorder_.events().OnCloseSoon =
|
||||
fit::bind_member(this, &LegacyMetricsClient::OnCloseSoon);
|
||||
user_events_recorder_ = std::make_unique<LegacyMetricsUserActionRecorder>();
|
||||
ScheduleNextReport();
|
||||
ConnectAndStartReporting();
|
||||
}
|
||||
|
||||
void LegacyMetricsClient::SetReportAdditionalMetricsCallback(
|
||||
@ -65,13 +65,26 @@ void LegacyMetricsClient::SetNotifyFlushCallback(NotifyFlushCallback callback) {
|
||||
notify_flush_callback_ = std::move(callback);
|
||||
}
|
||||
|
||||
void LegacyMetricsClient::ConnectAndStartReporting() {
|
||||
DCHECK(!metrics_recorder_) << "Trying to connect when already connected.";
|
||||
DVLOG(1) << "Trying to connect to MetricsRecorder service.";
|
||||
metrics_recorder_ = base::ComponentContextForProcess()
|
||||
->svc()
|
||||
->Connect<fuchsia::legacymetrics::MetricsRecorder>();
|
||||
metrics_recorder_.set_error_handler(fit::bind_member(
|
||||
this, &LegacyMetricsClient::OnMetricsRecorderDisconnected));
|
||||
metrics_recorder_.events().OnCloseSoon =
|
||||
fit::bind_member(this, &LegacyMetricsClient::OnCloseSoon);
|
||||
ScheduleNextReport();
|
||||
}
|
||||
|
||||
void LegacyMetricsClient::ScheduleNextReport() {
|
||||
DCHECK(!is_flushing_);
|
||||
|
||||
DVLOG(1) << "Scheduling next report in " << report_interval_.InSeconds()
|
||||
<< "seconds.";
|
||||
timer_.Start(FROM_HERE, report_interval_, this,
|
||||
&LegacyMetricsClient::StartReport);
|
||||
report_timer_.Start(FROM_HERE, report_interval_, this,
|
||||
&LegacyMetricsClient::StartReport);
|
||||
}
|
||||
|
||||
void LegacyMetricsClient::StartReport() {
|
||||
@ -88,7 +101,8 @@ void LegacyMetricsClient::Report(
|
||||
DVLOG(1) << __func__ << " called.";
|
||||
|
||||
// The connection might have dropped while additional metrics were being
|
||||
// collected.
|
||||
// collected. Continue recording events and cache them locally in memory until
|
||||
// connection is reestablished.
|
||||
if (!metrics_recorder_)
|
||||
return;
|
||||
|
||||
@ -147,6 +161,10 @@ void LegacyMetricsClient::DrainBuffer() {
|
||||
record_ack_pending_ = true;
|
||||
metrics_recorder_->Record(std::move(batch), [this]() {
|
||||
record_ack_pending_ = false;
|
||||
|
||||
// Reset the reconnect delay after a successful Record() call.
|
||||
reconnect_delay_ = kInitialReconnectDelay;
|
||||
|
||||
DrainBuffer();
|
||||
});
|
||||
}
|
||||
@ -154,9 +172,21 @@ void LegacyMetricsClient::DrainBuffer() {
|
||||
void LegacyMetricsClient::OnMetricsRecorderDisconnected(zx_status_t status) {
|
||||
ZX_LOG(ERROR, status) << "MetricsRecorder connection lost.";
|
||||
|
||||
// Stop recording & reporting user events.
|
||||
user_events_recorder_.reset();
|
||||
timer_.AbandonAndStop();
|
||||
// Stop reporting metric events.
|
||||
report_timer_.AbandonAndStop();
|
||||
|
||||
if (status == ZX_ERR_PEER_CLOSED) {
|
||||
DVLOG(1) << "Scheduling reconnect after " << reconnect_delay_;
|
||||
|
||||
// Try to reconnect with exponential backoff.
|
||||
reconnect_timer_.Start(FROM_HERE, reconnect_delay_, this,
|
||||
&LegacyMetricsClient::ConnectAndStartReporting);
|
||||
|
||||
// Increase delay exponentially. No random jittering since we don't expect
|
||||
// many clients overloading the service with simultaneous reconnections.
|
||||
reconnect_delay_ = std::min(reconnect_delay_ * kReconnectBackoffFactor,
|
||||
kMaxReconnectDelay);
|
||||
}
|
||||
}
|
||||
|
||||
void LegacyMetricsClient::FlushAndDisconnect(
|
||||
@ -167,7 +197,7 @@ void LegacyMetricsClient::FlushAndDisconnect(
|
||||
return;
|
||||
|
||||
on_flush_complete_ = std::move(on_flush_complete);
|
||||
timer_.AbandonAndStop();
|
||||
report_timer_.AbandonAndStop();
|
||||
|
||||
is_flushing_ = true;
|
||||
if (notify_flush_callback_) {
|
||||
|
@ -13,6 +13,7 @@
|
||||
#include "base/callback.h"
|
||||
#include "base/memory/weak_ptr.h"
|
||||
#include "base/sequence_checker.h"
|
||||
#include "base/time/time.h"
|
||||
#include "base/timer/timer.h"
|
||||
#include "fuchsia/base/legacymetrics_user_event_recorder.h"
|
||||
|
||||
@ -29,6 +30,13 @@ class LegacyMetricsClient {
|
||||
// the 64KB FIDL maximum message size.
|
||||
static constexpr size_t kMaxBatchSize = 50;
|
||||
|
||||
// Constants for FIDL reconnection with exponential backoff.
|
||||
static constexpr base::TimeDelta kInitialReconnectDelay =
|
||||
base::TimeDelta::FromSeconds(1);
|
||||
static constexpr base::TimeDelta kMaxReconnectDelay =
|
||||
base::TimeDelta::FromHours(1);
|
||||
static constexpr size_t kReconnectBackoffFactor = 2;
|
||||
|
||||
using ReportAdditionalMetricsCallback = base::RepeatingCallback<void(
|
||||
base::OnceCallback<void(std::vector<fuchsia::legacymetrics::Event>)>)>;
|
||||
using NotifyFlushCallback =
|
||||
@ -63,6 +71,7 @@ class LegacyMetricsClient {
|
||||
void FlushAndDisconnect(base::OnceClosure on_flush_complete);
|
||||
|
||||
private:
|
||||
void ConnectAndStartReporting();
|
||||
void ScheduleNextReport();
|
||||
void StartReport();
|
||||
void Report(std::vector<fuchsia::legacymetrics::Event> additional_metrics);
|
||||
@ -72,6 +81,7 @@ class LegacyMetricsClient {
|
||||
// Incrementally sends the contents of |to_send_| to |metrics_recorder_|.
|
||||
void DrainBuffer();
|
||||
|
||||
base::TimeDelta reconnect_delay_ = kInitialReconnectDelay;
|
||||
base::TimeDelta report_interval_;
|
||||
ReportAdditionalMetricsCallback report_additional_callback_;
|
||||
NotifyFlushCallback notify_flush_callback_;
|
||||
@ -81,7 +91,8 @@ class LegacyMetricsClient {
|
||||
std::unique_ptr<LegacyMetricsUserActionRecorder> user_events_recorder_;
|
||||
|
||||
fuchsia::legacymetrics::MetricsRecorderPtr metrics_recorder_;
|
||||
base::RetainingOneShotTimer timer_;
|
||||
base::RetainingOneShotTimer reconnect_timer_;
|
||||
base::RetainingOneShotTimer report_timer_;
|
||||
SEQUENCE_CHECKER(sequence_checker_);
|
||||
|
||||
base::OnceClosure on_flush_complete_;
|
||||
|
@ -4,6 +4,7 @@
|
||||
|
||||
#include <fuchsia/legacymetrics/cpp/fidl.h>
|
||||
#include <fuchsia/legacymetrics/cpp/fidl_test_base.h>
|
||||
#include <cmath>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
@ -13,15 +14,21 @@
|
||||
#include "base/test/bind.h"
|
||||
#include "base/test/task_environment.h"
|
||||
#include "base/threading/thread_task_runner_handle.h"
|
||||
#include "base/time/time.h"
|
||||
#include "fuchsia/base/legacymetrics_client.h"
|
||||
#include "fuchsia/base/legacymetrics_histogram_flattener.h"
|
||||
#include "fuchsia/base/result_receiver.h"
|
||||
#include "testing/gmock/include/gmock/gmock.h"
|
||||
#include "testing/gtest/include/gtest/gtest.h"
|
||||
|
||||
namespace cr_fuchsia {
|
||||
namespace {
|
||||
|
||||
using ::testing::Property;
|
||||
using ::testing::UnorderedElementsAreArray;
|
||||
|
||||
constexpr base::TimeDelta kReportInterval = base::TimeDelta::FromMinutes(1);
|
||||
constexpr base::TimeDelta kShortDuration = base::TimeDelta::FromSeconds(1);
|
||||
|
||||
class TestMetricsRecorder
|
||||
: public fuchsia::legacymetrics::testing::MetricsRecorder_TestBase {
|
||||
@ -85,16 +92,46 @@ class LegacyMetricsClientTest : public testing::Test {
|
||||
~LegacyMetricsClientTest() override = default;
|
||||
|
||||
void SetUp() override {
|
||||
service_binding_ =
|
||||
std::make_unique<base::fuchsia::ScopedSingleClientServiceBinding<
|
||||
fuchsia::legacymetrics::MetricsRecorder>>(
|
||||
test_context_.additional_services(), &test_recorder_);
|
||||
service_binding_ = MakeServiceBinding();
|
||||
base::SetRecordActionTaskRunner(base::ThreadTaskRunnerHandle::Get());
|
||||
|
||||
// Flush any dirty histograms from previous test runs in this process.
|
||||
GetLegacyMetricsDeltas();
|
||||
}
|
||||
|
||||
std::unique_ptr<base::fuchsia::ScopedSingleClientServiceBinding<
|
||||
fuchsia::legacymetrics::MetricsRecorder>>
|
||||
MakeServiceBinding() {
|
||||
return std::make_unique<base::fuchsia::ScopedSingleClientServiceBinding<
|
||||
fuchsia::legacymetrics::MetricsRecorder>>(
|
||||
test_context_.additional_services(), &test_recorder_);
|
||||
}
|
||||
|
||||
void StartClientAndExpectConnection() {
|
||||
client_.Start(kReportInterval);
|
||||
base::RunLoop().RunUntilIdle();
|
||||
EXPECT_TRUE(service_binding_->has_clients());
|
||||
}
|
||||
|
||||
// Disconnects the service side of the metrics FIDL channel and replaces the
|
||||
// binding with a new instance.
|
||||
void DisconnectAndRestartMetricsService() {
|
||||
service_binding_.reset();
|
||||
service_binding_ = MakeServiceBinding();
|
||||
base::RunLoop().RunUntilIdle();
|
||||
}
|
||||
|
||||
void ExpectReconnectAfterDelay(const base::TimeDelta& delay) {
|
||||
// Just before the expected delay, the client shouldn't reconnect yet.
|
||||
task_environment_.FastForwardBy(delay - kShortDuration);
|
||||
EXPECT_FALSE(service_binding_->has_clients())
|
||||
<< "Expected delay: " << delay;
|
||||
|
||||
// Complete the full expected reconnect delay. Client should reconnect.
|
||||
task_environment_.FastForwardBy(kShortDuration);
|
||||
EXPECT_TRUE(service_binding_->has_clients()) << "Expected delay: " << delay;
|
||||
}
|
||||
|
||||
protected:
|
||||
base::test::TaskEnvironment task_environment_;
|
||||
base::TestComponentContextForProcess test_context_;
|
||||
@ -221,10 +258,101 @@ TEST_F(LegacyMetricsClientTest, NoReportIfNeverAcked) {
|
||||
EXPECT_FALSE(test_recorder_.IsRecordInFlight());
|
||||
}
|
||||
|
||||
TEST_F(LegacyMetricsClientTest, MetricsChannelDisconnected) {
|
||||
client_.Start(kReportInterval);
|
||||
service_binding_.reset();
|
||||
TEST_F(LegacyMetricsClientTest, ReconnectAfterServiceDisconnect) {
|
||||
StartClientAndExpectConnection();
|
||||
DisconnectAndRestartMetricsService();
|
||||
EXPECT_FALSE(service_binding_->has_clients());
|
||||
task_environment_.FastForwardBy(LegacyMetricsClient::kInitialReconnectDelay);
|
||||
EXPECT_TRUE(service_binding_->has_clients());
|
||||
}
|
||||
|
||||
TEST_F(LegacyMetricsClientTest,
|
||||
ReconnectConsecutivelyWithoutRecordBacksOffExponentially) {
|
||||
StartClientAndExpectConnection();
|
||||
|
||||
for (base::TimeDelta expected_delay =
|
||||
LegacyMetricsClient::kInitialReconnectDelay;
|
||||
expected_delay <= LegacyMetricsClient::kMaxReconnectDelay;
|
||||
expected_delay *= LegacyMetricsClient::kReconnectBackoffFactor) {
|
||||
DisconnectAndRestartMetricsService();
|
||||
ExpectReconnectAfterDelay(expected_delay);
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(LegacyMetricsClientTest, ReconnectDelayNeverExceedsMax) {
|
||||
StartClientAndExpectConnection();
|
||||
|
||||
// Find the theoretical maximum number of consecutive failed connections. Also
|
||||
// add a few extra iterations to ensure that we never exceed the max delay.
|
||||
const size_t num_iterations =
|
||||
3 + log(LegacyMetricsClient::kMaxReconnectDelay /
|
||||
LegacyMetricsClient::kInitialReconnectDelay) /
|
||||
log(LegacyMetricsClient::kReconnectBackoffFactor);
|
||||
|
||||
// As a heuristic, starting with 1 second and a factor of 2 reaches 24 hours
|
||||
// in about 17 iterations. So the expected number of iterations needed to
|
||||
// reach the maximum delay should be less than about 20.
|
||||
EXPECT_LE(num_iterations, 20u);
|
||||
|
||||
for (size_t i = 0; i < num_iterations; i++) {
|
||||
DisconnectAndRestartMetricsService();
|
||||
EXPECT_FALSE(service_binding_->has_clients()) << "Iteration " << i;
|
||||
task_environment_.FastForwardBy(LegacyMetricsClient::kMaxReconnectDelay);
|
||||
EXPECT_TRUE(service_binding_->has_clients()) << "Iteration " << i;
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(LegacyMetricsClientTest, RecordCompletionResetsReconnectDelay) {
|
||||
StartClientAndExpectConnection();
|
||||
|
||||
// First reconnect has initial delay.
|
||||
DisconnectAndRestartMetricsService();
|
||||
ExpectReconnectAfterDelay(LegacyMetricsClient::kInitialReconnectDelay);
|
||||
|
||||
// Another reconnect without a successful Record() call increases the delay.
|
||||
DisconnectAndRestartMetricsService();
|
||||
ExpectReconnectAfterDelay(LegacyMetricsClient::kInitialReconnectDelay *
|
||||
LegacyMetricsClient::kReconnectBackoffFactor);
|
||||
|
||||
// Record and report an event, invoking a FIDL Record().
|
||||
base::RecordComputedAction("ArbitraryEvent");
|
||||
task_environment_.FastForwardBy(kReportInterval);
|
||||
EXPECT_TRUE(test_recorder_.IsRecordInFlight());
|
||||
test_recorder_.SendAck();
|
||||
base::RunLoop().RunUntilIdle();
|
||||
|
||||
// Reconnect after a successful Record() uses the initial delay again.
|
||||
DisconnectAndRestartMetricsService();
|
||||
ExpectReconnectAfterDelay(LegacyMetricsClient::kInitialReconnectDelay);
|
||||
}
|
||||
|
||||
TEST_F(LegacyMetricsClientTest, ContinueRecordingUserActionsAfterDisconnect) {
|
||||
StartClientAndExpectConnection();
|
||||
|
||||
base::RecordComputedAction("BeforeDisconnect");
|
||||
DisconnectAndRestartMetricsService();
|
||||
base::RecordComputedAction("DuringDisconnect");
|
||||
ExpectReconnectAfterDelay(LegacyMetricsClient::kInitialReconnectDelay);
|
||||
base::RecordComputedAction("AfterReconnect");
|
||||
|
||||
// Fast forward to report metrics.
|
||||
task_environment_.FastForwardBy(kReportInterval);
|
||||
EXPECT_TRUE(test_recorder_.IsRecordInFlight());
|
||||
|
||||
auto events = test_recorder_.WaitForEvents();
|
||||
EXPECT_THAT(
|
||||
events,
|
||||
UnorderedElementsAreArray({
|
||||
Property(&fuchsia::legacymetrics::Event::user_action_event,
|
||||
Property(&fuchsia::legacymetrics::UserActionEvent::name,
|
||||
"BeforeDisconnect")),
|
||||
Property(&fuchsia::legacymetrics::Event::user_action_event,
|
||||
Property(&fuchsia::legacymetrics::UserActionEvent::name,
|
||||
"DuringDisconnect")),
|
||||
Property(&fuchsia::legacymetrics::Event::user_action_event,
|
||||
Property(&fuchsia::legacymetrics::UserActionEvent::name,
|
||||
"AfterReconnect")),
|
||||
}));
|
||||
}
|
||||
|
||||
TEST_F(LegacyMetricsClientTest, Batching) {
|
||||
|
@ -11,6 +11,8 @@
|
||||
|
||||
namespace cr_fuchsia {
|
||||
|
||||
constexpr size_t LegacyMetricsUserActionRecorder::kMaxEventCount;
|
||||
|
||||
LegacyMetricsUserActionRecorder::LegacyMetricsUserActionRecorder()
|
||||
: on_event_callback_(
|
||||
base::BindRepeating(&LegacyMetricsUserActionRecorder::OnUserAction,
|
||||
@ -33,6 +35,9 @@ LegacyMetricsUserActionRecorder::TakeEvents() {
|
||||
|
||||
void LegacyMetricsUserActionRecorder::OnUserAction(const std::string& action,
|
||||
base::TimeTicks time) {
|
||||
if (events_.size() >= kMaxEventCount)
|
||||
return;
|
||||
|
||||
fuchsia::legacymetrics::UserActionEvent fidl_event;
|
||||
fidl_event.set_name(action);
|
||||
fidl_event.set_time(time.ToZxTime());
|
||||
|
@ -17,6 +17,9 @@ namespace cr_fuchsia {
|
||||
// fuchsia.legacymetrics equivalent.
|
||||
class LegacyMetricsUserActionRecorder {
|
||||
public:
|
||||
// Maximum number of Events to store locally before dropping new ones.
|
||||
static constexpr size_t kMaxEventCount = 5000;
|
||||
|
||||
LegacyMetricsUserActionRecorder();
|
||||
~LegacyMetricsUserActionRecorder();
|
||||
|
||||
|
@ -72,5 +72,19 @@ TEST_F(LegacyMetricsUserActionRecorderTest, EmptyBuffer) {
|
||||
EXPECT_FALSE(buffer.HasEvents());
|
||||
}
|
||||
|
||||
TEST_F(LegacyMetricsUserActionRecorderTest, EnforcesMaximumEventCount) {
|
||||
LegacyMetricsUserActionRecorder buffer;
|
||||
|
||||
// Try recording twice the maximum number of locally stored events.
|
||||
for (int i = 0, count = LegacyMetricsUserActionRecorder::kMaxEventCount * 2;
|
||||
i < count; i++) {
|
||||
base::RecordComputedAction("Test");
|
||||
}
|
||||
|
||||
EXPECT_TRUE(buffer.HasEvents());
|
||||
auto events = buffer.TakeEvents();
|
||||
EXPECT_EQ(LegacyMetricsUserActionRecorder::kMaxEventCount, events.size());
|
||||
}
|
||||
|
||||
} // namespace
|
||||
} // namespace cr_fuchsia
|
||||
|
Reference in New Issue
Block a user