0

[legacymetrics] Reconnect to FIDL automatically

Implements automatic FIDL reconnection with exponential backoff. If the
client loses connection to the MetricsRecorder FIDL service, it will
attempt to reconnect after 1 second. Consecutive disconnect/reconnect
cycles without a successful Record() FIDL call will wait for twice as
long, up to a maximum of 1 hour.

Also continues to record user action events even during service
disconnect, up to a maximum of 5000 events before dropping new events.

Bug: b/170876057
Test: cr_fuchsia_base_unittests
Change-Id: Ica2ed85f619834d6a50b546420e7acea86a9549c
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2547793
Commit-Queue: Akira Baruah <akirabaruah@google.com>
Reviewed-by: Kevin Marshall <kmarshall@chromium.org>
Cr-Commit-Position: refs/heads/master@{#835942}
This commit is contained in:
Akira Baruah
2020-12-11 01:35:09 +00:00
committed by Chromium LUCI CQ
parent 77851b97f1
commit 7b75418bae
7 changed files with 217 additions and 25 deletions

@ -133,6 +133,7 @@ test("cr_fuchsia_base_unittests") {
"//base/test:run_all_unittests",
"//base/test:test_support",
"//components/version_info",
"//testing/gmock",
"//testing/gtest",
"//third_party/fuchsia-sdk/sdk/pkg/sys_cpp",
"//third_party/fuchsia-sdk/sdk/pkg/sys_inspect_cpp",

@ -6,6 +6,8 @@
#include <lib/fit/function.h>
#include <lib/sys/cpp/component_context.h>
#include <zircon/errors.h>
#include <algorithm>
#include <memory>
#include <utility>
@ -22,6 +24,10 @@ namespace cr_fuchsia {
constexpr size_t LegacyMetricsClient::kMaxBatchSize;
constexpr base::TimeDelta LegacyMetricsClient::kInitialReconnectDelay;
constexpr base::TimeDelta LegacyMetricsClient::kMaxReconnectDelay;
constexpr size_t LegacyMetricsClient::kReconnectBackoffFactor;
LegacyMetricsClient::LegacyMetricsClient() = default;
LegacyMetricsClient::~LegacyMetricsClient() {
@ -31,18 +37,12 @@ LegacyMetricsClient::~LegacyMetricsClient() {
void LegacyMetricsClient::Start(base::TimeDelta report_interval) {
DCHECK_CALLED_ON_VALID_SEQUENCE(sequence_checker_);
DCHECK_GT(report_interval, base::TimeDelta::FromSeconds(0));
DCHECK(!metrics_recorder_) << "Start() called more than once.";
// Start recording user events.
user_events_recorder_ = std::make_unique<LegacyMetricsUserActionRecorder>();
report_interval_ = report_interval;
metrics_recorder_ = base::ComponentContextForProcess()
->svc()
->Connect<fuchsia::legacymetrics::MetricsRecorder>();
metrics_recorder_.set_error_handler(fit::bind_member(
this, &LegacyMetricsClient::OnMetricsRecorderDisconnected));
metrics_recorder_.events().OnCloseSoon =
fit::bind_member(this, &LegacyMetricsClient::OnCloseSoon);
user_events_recorder_ = std::make_unique<LegacyMetricsUserActionRecorder>();
ScheduleNextReport();
ConnectAndStartReporting();
}
void LegacyMetricsClient::SetReportAdditionalMetricsCallback(
@ -65,13 +65,26 @@ void LegacyMetricsClient::SetNotifyFlushCallback(NotifyFlushCallback callback) {
notify_flush_callback_ = std::move(callback);
}
void LegacyMetricsClient::ConnectAndStartReporting() {
DCHECK(!metrics_recorder_) << "Trying to connect when already connected.";
DVLOG(1) << "Trying to connect to MetricsRecorder service.";
metrics_recorder_ = base::ComponentContextForProcess()
->svc()
->Connect<fuchsia::legacymetrics::MetricsRecorder>();
metrics_recorder_.set_error_handler(fit::bind_member(
this, &LegacyMetricsClient::OnMetricsRecorderDisconnected));
metrics_recorder_.events().OnCloseSoon =
fit::bind_member(this, &LegacyMetricsClient::OnCloseSoon);
ScheduleNextReport();
}
void LegacyMetricsClient::ScheduleNextReport() {
DCHECK(!is_flushing_);
DVLOG(1) << "Scheduling next report in " << report_interval_.InSeconds()
<< "seconds.";
timer_.Start(FROM_HERE, report_interval_, this,
&LegacyMetricsClient::StartReport);
report_timer_.Start(FROM_HERE, report_interval_, this,
&LegacyMetricsClient::StartReport);
}
void LegacyMetricsClient::StartReport() {
@ -88,7 +101,8 @@ void LegacyMetricsClient::Report(
DVLOG(1) << __func__ << " called.";
// The connection might have dropped while additional metrics were being
// collected.
// collected. Continue recording events and cache them locally in memory until
// connection is reestablished.
if (!metrics_recorder_)
return;
@ -147,6 +161,10 @@ void LegacyMetricsClient::DrainBuffer() {
record_ack_pending_ = true;
metrics_recorder_->Record(std::move(batch), [this]() {
record_ack_pending_ = false;
// Reset the reconnect delay after a successful Record() call.
reconnect_delay_ = kInitialReconnectDelay;
DrainBuffer();
});
}
@ -154,9 +172,21 @@ void LegacyMetricsClient::DrainBuffer() {
void LegacyMetricsClient::OnMetricsRecorderDisconnected(zx_status_t status) {
ZX_LOG(ERROR, status) << "MetricsRecorder connection lost.";
// Stop recording & reporting user events.
user_events_recorder_.reset();
timer_.AbandonAndStop();
// Stop reporting metric events.
report_timer_.AbandonAndStop();
if (status == ZX_ERR_PEER_CLOSED) {
DVLOG(1) << "Scheduling reconnect after " << reconnect_delay_;
// Try to reconnect with exponential backoff.
reconnect_timer_.Start(FROM_HERE, reconnect_delay_, this,
&LegacyMetricsClient::ConnectAndStartReporting);
// Increase delay exponentially. No random jittering since we don't expect
// many clients overloading the service with simultaneous reconnections.
reconnect_delay_ = std::min(reconnect_delay_ * kReconnectBackoffFactor,
kMaxReconnectDelay);
}
}
void LegacyMetricsClient::FlushAndDisconnect(
@ -167,7 +197,7 @@ void LegacyMetricsClient::FlushAndDisconnect(
return;
on_flush_complete_ = std::move(on_flush_complete);
timer_.AbandonAndStop();
report_timer_.AbandonAndStop();
is_flushing_ = true;
if (notify_flush_callback_) {

@ -13,6 +13,7 @@
#include "base/callback.h"
#include "base/memory/weak_ptr.h"
#include "base/sequence_checker.h"
#include "base/time/time.h"
#include "base/timer/timer.h"
#include "fuchsia/base/legacymetrics_user_event_recorder.h"
@ -29,6 +30,13 @@ class LegacyMetricsClient {
// the 64KB FIDL maximum message size.
static constexpr size_t kMaxBatchSize = 50;
// Constants for FIDL reconnection with exponential backoff.
static constexpr base::TimeDelta kInitialReconnectDelay =
base::TimeDelta::FromSeconds(1);
static constexpr base::TimeDelta kMaxReconnectDelay =
base::TimeDelta::FromHours(1);
static constexpr size_t kReconnectBackoffFactor = 2;
using ReportAdditionalMetricsCallback = base::RepeatingCallback<void(
base::OnceCallback<void(std::vector<fuchsia::legacymetrics::Event>)>)>;
using NotifyFlushCallback =
@ -63,6 +71,7 @@ class LegacyMetricsClient {
void FlushAndDisconnect(base::OnceClosure on_flush_complete);
private:
void ConnectAndStartReporting();
void ScheduleNextReport();
void StartReport();
void Report(std::vector<fuchsia::legacymetrics::Event> additional_metrics);
@ -72,6 +81,7 @@ class LegacyMetricsClient {
// Incrementally sends the contents of |to_send_| to |metrics_recorder_|.
void DrainBuffer();
base::TimeDelta reconnect_delay_ = kInitialReconnectDelay;
base::TimeDelta report_interval_;
ReportAdditionalMetricsCallback report_additional_callback_;
NotifyFlushCallback notify_flush_callback_;
@ -81,7 +91,8 @@ class LegacyMetricsClient {
std::unique_ptr<LegacyMetricsUserActionRecorder> user_events_recorder_;
fuchsia::legacymetrics::MetricsRecorderPtr metrics_recorder_;
base::RetainingOneShotTimer timer_;
base::RetainingOneShotTimer reconnect_timer_;
base::RetainingOneShotTimer report_timer_;
SEQUENCE_CHECKER(sequence_checker_);
base::OnceClosure on_flush_complete_;

@ -4,6 +4,7 @@
#include <fuchsia/legacymetrics/cpp/fidl.h>
#include <fuchsia/legacymetrics/cpp/fidl_test_base.h>
#include <cmath>
#include <string>
#include <utility>
@ -13,15 +14,21 @@
#include "base/test/bind.h"
#include "base/test/task_environment.h"
#include "base/threading/thread_task_runner_handle.h"
#include "base/time/time.h"
#include "fuchsia/base/legacymetrics_client.h"
#include "fuchsia/base/legacymetrics_histogram_flattener.h"
#include "fuchsia/base/result_receiver.h"
#include "testing/gmock/include/gmock/gmock.h"
#include "testing/gtest/include/gtest/gtest.h"
namespace cr_fuchsia {
namespace {
using ::testing::Property;
using ::testing::UnorderedElementsAreArray;
constexpr base::TimeDelta kReportInterval = base::TimeDelta::FromMinutes(1);
constexpr base::TimeDelta kShortDuration = base::TimeDelta::FromSeconds(1);
class TestMetricsRecorder
: public fuchsia::legacymetrics::testing::MetricsRecorder_TestBase {
@ -85,16 +92,46 @@ class LegacyMetricsClientTest : public testing::Test {
~LegacyMetricsClientTest() override = default;
void SetUp() override {
service_binding_ =
std::make_unique<base::fuchsia::ScopedSingleClientServiceBinding<
fuchsia::legacymetrics::MetricsRecorder>>(
test_context_.additional_services(), &test_recorder_);
service_binding_ = MakeServiceBinding();
base::SetRecordActionTaskRunner(base::ThreadTaskRunnerHandle::Get());
// Flush any dirty histograms from previous test runs in this process.
GetLegacyMetricsDeltas();
}
std::unique_ptr<base::fuchsia::ScopedSingleClientServiceBinding<
fuchsia::legacymetrics::MetricsRecorder>>
MakeServiceBinding() {
return std::make_unique<base::fuchsia::ScopedSingleClientServiceBinding<
fuchsia::legacymetrics::MetricsRecorder>>(
test_context_.additional_services(), &test_recorder_);
}
void StartClientAndExpectConnection() {
client_.Start(kReportInterval);
base::RunLoop().RunUntilIdle();
EXPECT_TRUE(service_binding_->has_clients());
}
// Disconnects the service side of the metrics FIDL channel and replaces the
// binding with a new instance.
void DisconnectAndRestartMetricsService() {
service_binding_.reset();
service_binding_ = MakeServiceBinding();
base::RunLoop().RunUntilIdle();
}
void ExpectReconnectAfterDelay(const base::TimeDelta& delay) {
// Just before the expected delay, the client shouldn't reconnect yet.
task_environment_.FastForwardBy(delay - kShortDuration);
EXPECT_FALSE(service_binding_->has_clients())
<< "Expected delay: " << delay;
// Complete the full expected reconnect delay. Client should reconnect.
task_environment_.FastForwardBy(kShortDuration);
EXPECT_TRUE(service_binding_->has_clients()) << "Expected delay: " << delay;
}
protected:
base::test::TaskEnvironment task_environment_;
base::TestComponentContextForProcess test_context_;
@ -221,10 +258,101 @@ TEST_F(LegacyMetricsClientTest, NoReportIfNeverAcked) {
EXPECT_FALSE(test_recorder_.IsRecordInFlight());
}
TEST_F(LegacyMetricsClientTest, MetricsChannelDisconnected) {
client_.Start(kReportInterval);
service_binding_.reset();
TEST_F(LegacyMetricsClientTest, ReconnectAfterServiceDisconnect) {
StartClientAndExpectConnection();
DisconnectAndRestartMetricsService();
EXPECT_FALSE(service_binding_->has_clients());
task_environment_.FastForwardBy(LegacyMetricsClient::kInitialReconnectDelay);
EXPECT_TRUE(service_binding_->has_clients());
}
TEST_F(LegacyMetricsClientTest,
ReconnectConsecutivelyWithoutRecordBacksOffExponentially) {
StartClientAndExpectConnection();
for (base::TimeDelta expected_delay =
LegacyMetricsClient::kInitialReconnectDelay;
expected_delay <= LegacyMetricsClient::kMaxReconnectDelay;
expected_delay *= LegacyMetricsClient::kReconnectBackoffFactor) {
DisconnectAndRestartMetricsService();
ExpectReconnectAfterDelay(expected_delay);
}
}
TEST_F(LegacyMetricsClientTest, ReconnectDelayNeverExceedsMax) {
StartClientAndExpectConnection();
// Find the theoretical maximum number of consecutive failed connections. Also
// add a few extra iterations to ensure that we never exceed the max delay.
const size_t num_iterations =
3 + log(LegacyMetricsClient::kMaxReconnectDelay /
LegacyMetricsClient::kInitialReconnectDelay) /
log(LegacyMetricsClient::kReconnectBackoffFactor);
// As a heuristic, starting with 1 second and a factor of 2 reaches 24 hours
// in about 17 iterations. So the expected number of iterations needed to
// reach the maximum delay should be less than about 20.
EXPECT_LE(num_iterations, 20u);
for (size_t i = 0; i < num_iterations; i++) {
DisconnectAndRestartMetricsService();
EXPECT_FALSE(service_binding_->has_clients()) << "Iteration " << i;
task_environment_.FastForwardBy(LegacyMetricsClient::kMaxReconnectDelay);
EXPECT_TRUE(service_binding_->has_clients()) << "Iteration " << i;
}
}
TEST_F(LegacyMetricsClientTest, RecordCompletionResetsReconnectDelay) {
StartClientAndExpectConnection();
// First reconnect has initial delay.
DisconnectAndRestartMetricsService();
ExpectReconnectAfterDelay(LegacyMetricsClient::kInitialReconnectDelay);
// Another reconnect without a successful Record() call increases the delay.
DisconnectAndRestartMetricsService();
ExpectReconnectAfterDelay(LegacyMetricsClient::kInitialReconnectDelay *
LegacyMetricsClient::kReconnectBackoffFactor);
// Record and report an event, invoking a FIDL Record().
base::RecordComputedAction("ArbitraryEvent");
task_environment_.FastForwardBy(kReportInterval);
EXPECT_TRUE(test_recorder_.IsRecordInFlight());
test_recorder_.SendAck();
base::RunLoop().RunUntilIdle();
// Reconnect after a successful Record() uses the initial delay again.
DisconnectAndRestartMetricsService();
ExpectReconnectAfterDelay(LegacyMetricsClient::kInitialReconnectDelay);
}
TEST_F(LegacyMetricsClientTest, ContinueRecordingUserActionsAfterDisconnect) {
StartClientAndExpectConnection();
base::RecordComputedAction("BeforeDisconnect");
DisconnectAndRestartMetricsService();
base::RecordComputedAction("DuringDisconnect");
ExpectReconnectAfterDelay(LegacyMetricsClient::kInitialReconnectDelay);
base::RecordComputedAction("AfterReconnect");
// Fast forward to report metrics.
task_environment_.FastForwardBy(kReportInterval);
EXPECT_TRUE(test_recorder_.IsRecordInFlight());
auto events = test_recorder_.WaitForEvents();
EXPECT_THAT(
events,
UnorderedElementsAreArray({
Property(&fuchsia::legacymetrics::Event::user_action_event,
Property(&fuchsia::legacymetrics::UserActionEvent::name,
"BeforeDisconnect")),
Property(&fuchsia::legacymetrics::Event::user_action_event,
Property(&fuchsia::legacymetrics::UserActionEvent::name,
"DuringDisconnect")),
Property(&fuchsia::legacymetrics::Event::user_action_event,
Property(&fuchsia::legacymetrics::UserActionEvent::name,
"AfterReconnect")),
}));
}
TEST_F(LegacyMetricsClientTest, Batching) {

@ -11,6 +11,8 @@
namespace cr_fuchsia {
constexpr size_t LegacyMetricsUserActionRecorder::kMaxEventCount;
LegacyMetricsUserActionRecorder::LegacyMetricsUserActionRecorder()
: on_event_callback_(
base::BindRepeating(&LegacyMetricsUserActionRecorder::OnUserAction,
@ -33,6 +35,9 @@ LegacyMetricsUserActionRecorder::TakeEvents() {
void LegacyMetricsUserActionRecorder::OnUserAction(const std::string& action,
base::TimeTicks time) {
if (events_.size() >= kMaxEventCount)
return;
fuchsia::legacymetrics::UserActionEvent fidl_event;
fidl_event.set_name(action);
fidl_event.set_time(time.ToZxTime());

@ -17,6 +17,9 @@ namespace cr_fuchsia {
// fuchsia.legacymetrics equivalent.
class LegacyMetricsUserActionRecorder {
public:
// Maximum number of Events to store locally before dropping new ones.
static constexpr size_t kMaxEventCount = 5000;
LegacyMetricsUserActionRecorder();
~LegacyMetricsUserActionRecorder();

@ -72,5 +72,19 @@ TEST_F(LegacyMetricsUserActionRecorderTest, EmptyBuffer) {
EXPECT_FALSE(buffer.HasEvents());
}
TEST_F(LegacyMetricsUserActionRecorderTest, EnforcesMaximumEventCount) {
LegacyMetricsUserActionRecorder buffer;
// Try recording twice the maximum number of locally stored events.
for (int i = 0, count = LegacyMetricsUserActionRecorder::kMaxEventCount * 2;
i < count; i++) {
base::RecordComputedAction("Test");
}
EXPECT_TRUE(buffer.HasEvents());
auto events = buffer.TakeEvents();
EXPECT_EQ(LegacyMetricsUserActionRecorder::kMaxEventCount, events.size());
}
} // namespace
} // namespace cr_fuchsia