/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import axios from 'axios';
import { pluralIfNeeded, queryDruidSql } from '../../utils';
import { deepGet } from '../../utils/object-change';
import { postToSampler } from '../../utils/sampler';
/**
 * Callbacks handed to every doctor check so that it can report what it found.
 */
export interface CheckControls {
  /** Report a problem, described by `message`. */
  addIssue: (message: string) => void;

  /** Report a recommendation, described by `message`. */
  addSuggestion: (message: string) => void;

  /**
   * Called by a check when it hits a failure it considers fatal (for example the Router or SQL
   * being unreachable). Presumably stops the remaining checks from running - confirm in the runner.
   */
  terminateChecks: () => void;
}
/**
 * A single named diagnostic.
 */
export interface DoctorCheck {
  /** Human readable label for the check. */
  name: string;

  /**
   * Runs the diagnostic, reporting findings through `controls`.
   * The returned promise settles once the check has finished.
   */
  check: (controls: CheckControls) => Promise<void>;
}
/**
 * Runtime properties whose values are compared between the Router and both the Coordinator and
 * the Overlord. A mismatch is reported as an issue.
 */
const RUNTIME_PROPERTIES_ALL_NODES_MUST_AGREE_ON: string[] = ['user.timezone', 'druid.zk.service.host'];
// In the future (when we can query other services) it would also be cool to check:
// 'druid.storage.type' <=> historicals, overlords, mm
// 'druid.indexer.logs.type' <=> overlord, mm, + peons
/**
 * Runtime properties whose values are compared between the Coordinator and the Overlord.
 * A mismatch is reported as a suggestion rather than an issue.
 */
const RUNTIME_PROPERTIES_MASTER_NODES_SHOULD_AGREE_ON: string[] = [
  // overlord + coordinator
  'druid.metadata.storage.type',
  'druid.metadata.storage.connector.connectURI',
];
/**
 * The diagnostics offered by the doctor.
 *
 * Every entry talks to the cluster (through `axios`, `queryDruidSql` or `postToSampler`) and
 * reports what it finds through the supplied `CheckControls`: `addIssue` for problems,
 * `addSuggestion` for recommendations. The checks for the Router's own /status and for basic SQL
 * additionally call `terminateChecks()` when the request itself fails.
 */
export const DOCTOR_CHECKS: DoctorCheck[] = [
  // -------------------------------------
  // Self (router) checks
  // -------------------------------------
  {
    name: 'Verify own status',
    check: async controls => {
      // Make sure that the router responds to /status and gives some valid info back
      let status: any;
      try {
        status = (await axios.get(`/status`)).data;
      } catch (e) {
        controls.addIssue(
          `Did not get a /status response from the Router service. Try confirming that it is running and accessible. Got: ${e.message}`,
        );
        controls.terminateChecks();
        return;
      }

      // An empty (null / undefined) payload is reported as an invalid response instead of throwing
      if (!status || typeof status.version !== 'string') {
        controls.addIssue('Could not get a valid /status response from the Router.');
      }
    },
  },
  {
    name: 'Verify own runtime properties',
    check: async controls => {
      // Make sure that everything in /status/properties is above board
      let properties: Record<string, string>;
      try {
        properties = (await axios.get(`/status/properties`)).data;
      } catch (e) {
        controls.addIssue(
          `Did not get a /status/properties response from the Router. Message: ${e.message}`,
        );
        return;
      }

      // Check that the management proxy is on, it really should be for someone to access the console in the first place but anything could happen
      if (properties['druid.router.managementProxy.enabled'] !== 'true') {
        controls.addIssue(
          `The Router's "druid.router.managementProxy.enabled" is not reported as "true". This means that the Coordinator and Overlord will not be accessible from the Router (and this console).`,
        );
      }

      // Check that the underlying Java is Java 8, the only officially supported Java version at the moment.
      if (
        properties['java.specification.version'] &&
        properties['java.specification.version'] !== '1.8'
      ) {
        controls.addSuggestion(
          `It looks like you are running Java ${properties['java.runtime.version']}. Druid only officially supports Java 1.8.x`,
        );
      }

      // Check "file.encoding"
      if (properties['file.encoding'] && properties['file.encoding'] !== 'UTF-8') {
        controls.addSuggestion(
          `It looks like "file.encoding" is set to ${properties['file.encoding']}, it is recommended to set this to "UTF-8"`,
        );
      }

      // Check "user.timezone"
      if (properties['user.timezone'] && properties['user.timezone'] !== 'UTC') {
        controls.addSuggestion(
          `It looks like "user.timezone" is set to ${properties['user.timezone']}, it is recommended to set this to "UTC"`,
        );
      }
    },
  },

  // -------------------------------------
  // Coordinator and Overlord
  // -------------------------------------
  {
    name: 'Verify the Coordinator and Overlord status',
    check: async controls => {
      // Make sure that the Coordinator and Overlord respond to /status and report the same version as the Router
      let myStatus: any;
      try {
        myStatus = (await axios.get(`/status`)).data;
      } catch {
        // Nothing to compare against; the 'Verify own status' check reports this failure
        return;
      }

      let coordinatorStatus: any;
      try {
        coordinatorStatus = (await axios.get(`/proxy/coordinator/status`)).data;
      } catch {
        controls.addIssue(
          'Did not get a /status response from the Coordinator service. Try confirming that it is running and accessible.',
        );
        return;
      }

      let overlordStatus: any;
      try {
        overlordStatus = (await axios.get(`/proxy/overlord/status`)).data;
      } catch {
        controls.addIssue(
          'Did not get a /status response from the Overlord service. Try confirming that it is running and accessible.',
        );
        return;
      }

      if (myStatus.version !== coordinatorStatus.version) {
        controls.addSuggestion(
          `It looks like the Router and Coordinator services are on different versions of Druid. This may indicate a problem if you are not in the middle of a rolling upgrade.`,
        );
      }

      if (myStatus.version !== overlordStatus.version) {
        controls.addSuggestion(
          `It looks like the Router and Overlord services are on different versions of Druid. This may indicate a problem if you are not in the middle of a rolling upgrade.`,
        );
      }
    },
  },
  {
    name: 'Verify the Coordinator and Overlord runtime properties',
    check: async controls => {
      // Make sure that everything in coordinator and overlord /status/properties is good and matches where needed
      let myProperties: Record<string, string>;
      try {
        myProperties = (await axios.get(`/status/properties`)).data;
      } catch {
        // Nothing to compare against; the 'Verify own runtime properties' check reports this failure
        return;
      }

      let coordinatorProperties: Record<string, string>;
      try {
        coordinatorProperties = (await axios.get(`/proxy/coordinator/status/properties`)).data;
      } catch {
        controls.addIssue(
          'Did not get a /status/properties response from the coordinator. Try confirming that it is running and accessible.',
        );
        return;
      }

      let overlordProperties: Record<string, string>;
      try {
        overlordProperties = (await axios.get(`/proxy/overlord/status/properties`)).data;
      } catch {
        controls.addIssue(
          'Did not get a /status/properties response from the overlord. Try confirming that it is running and accessible.',
        );
        return;
      }

      // Properties that the Router must share with both master services
      for (const prop of RUNTIME_PROPERTIES_ALL_NODES_MUST_AGREE_ON) {
        if (myProperties[prop] !== coordinatorProperties[prop]) {
          controls.addIssue(
            `The Router and Coordinator do not agree on the "${prop}" runtime property ("${myProperties[prop]}" vs "${coordinatorProperties[prop]}")`,
          );
        }
        if (myProperties[prop] !== overlordProperties[prop]) {
          controls.addIssue(
            `The Router and Overlord do not agree on the "${prop}" runtime property ("${myProperties[prop]}" vs "${overlordProperties[prop]}")`,
          );
        }
      }

      // Properties that only the Coordinator and Overlord are compared on
      for (const prop of RUNTIME_PROPERTIES_MASTER_NODES_SHOULD_AGREE_ON) {
        if (coordinatorProperties[prop] !== overlordProperties[prop]) {
          controls.addSuggestion(
            `The Coordinator and Overlord do not agree on the "${prop}" runtime property ("${coordinatorProperties[prop]}" vs "${overlordProperties[prop]}")`,
          );
        }
      }
    },
  },

  // -------------------------------------
  // Check sampler
  // -------------------------------------
  {
    name: 'Verify that the sampler works',
    check: async controls => {
      // Sample a single inline JSON row and make sure that it comes back parsed
      let testSampledData: any;
      try {
        testSampledData = await postToSampler(
          {
            type: 'index_parallel',
            spec: {
              type: 'index_parallel',
              ioConfig: {
                type: 'index_parallel',
                inputSource: { type: 'inline', data: '{"test":"Data"}' },
                inputFormat: { type: 'json' },
              },
              dataSchema: {
                dataSource: 'sample',
                timestampSpec: {
                  column: '!!!_no_such_column_!!!',
                  missingValue: '2010-01-01T00:00:00Z',
                },
                dimensionsSpec: { dimensions: ['test'] },
                transformSpec: {},
                metricsSpec: [],
                granularitySpec: { queryGranularity: 'NONE' },
              },
            },
            samplerConfig: {
              numRows: 50,
              timeoutMs: 1000,
            },
          },
          'doctor',
        );
      } catch {
        controls.addIssue(`Could not use the sampler.`);
        return;
      }

      // The "test" value of the inline row must be found on the first sampled row
      if (deepGet(testSampledData, 'data.0.parsed.test') !== 'Data') {
        controls.addIssue(`Sampler returned incorrect data.`);
      }
    },
  },

  // -------------------------------------
  // Check SQL
  // -------------------------------------
  {
    name: 'Verify that SQL works',
    check: async controls => {
      // Make sure that we can run the simplest query
      let sqlResult: any[];
      try {
        sqlResult = await queryDruidSql({ query: `SELECT 1 + 1 AS "two"` });
      } catch (e) {
        controls.addIssue(
          `Could not query SQL ensure that "druid.sql.enable" is set to "true" and that there is a Broker service running. Got: ${e.message}`,
        );
        controls.terminateChecks();
        return;
      }

      if (sqlResult.length !== 1 || sqlResult[0]['two'] !== 2) {
        controls.addIssue(`Got incorrect results from a basic SQL query.`);
      }
    },
  },
  {
    name: 'Verify that there are historical services',
    check: async controls => {
      // Make sure that there are historical services reported from sys.servers
      let sqlResult: any[];
      try {
        sqlResult = await queryDruidSql({
          query: `SELECT
COUNT(*) AS "historicals"
FROM sys.servers
WHERE "server_type" = 'historical'`,
        });
      } catch (e) {
        controls.addIssue(`Could not run a sys.servers query. Got: ${e.message}`);
        return;
      }

      if (sqlResult.length === 1 && sqlResult[0]['historicals'] === 0) {
        controls.addIssue(`There do not appear to be any historical services.`);
      }
    },
  },
  {
    name: 'Verify that the historicals are not overfilled',
    check: async controls => {
      // The query only returns historicals that are over 90% full;
      // those over 95% are reported as issues, the rest as suggestions
      let sqlResult: any[];
      try {
        sqlResult = await queryDruidSql({
          query: `SELECT
"server" AS "service",
"curr_size" * 1.0 / "max_size" AS "fill"
FROM sys.servers
WHERE "server_type" = 'historical' AND "curr_size" * 1.0 / "max_size" > 0.9
ORDER BY "server" DESC`,
        });
      } catch (e) {
        controls.addIssue(`Could not run a sys.servers query. Got: ${e.message}`);
        return;
      }

      // Renders the "fill" ratio of a row as a percentage with two decimals
      const formatPercent = (service: any): string => (service['fill'] * 100).toFixed(2);

      for (const service of sqlResult) {
        if (service['fill'] > 0.95) {
          controls.addIssue(
            `Historical "${service['service']}" appears to be over 95% full (is ${formatPercent(
              service,
            )}%). Increase capacity.`,
          );
        } else {
          controls.addSuggestion(
            `Historical "${service['service']}" appears to be over 90% full (is ${formatPercent(
              service,
            )}%)`,
          );
        }
      }
    },
  },
  {
    name: 'Look for time chunks that could benefit from compaction',
    check: async controls => {
      // Check for any time chunks (starting more than a day ago) where there is more than 1 segment and avg segment size is less than 100MB
      const dayAgo = new Date(Date.now() - 24 * 60 * 60 * 1000).toISOString();

      let sqlResult: any[];
      try {
        sqlResult = await queryDruidSql({
          query: `SELECT
"datasource",
COUNT(*) AS "num_bad_time_chunks"
FROM (
SELECT
"datasource", "start", "end",
AVG("size") AS "avg_segment_size_in_time_chunk",
SUM("size") AS "total_size",
COUNT(*) AS "num_segments"
FROM sys.segments
WHERE is_published = 1 AND "start" < '${dayAgo}'
GROUP BY 1, 2, 3
HAVING "num_segments" > 1 AND "total_size" > 1 AND "avg_segment_size_in_time_chunk" < 100000000
)
GROUP BY 1
ORDER BY "num_bad_time_chunks"`,
        });
      } catch {
        // NOTE(review): a failure is silently ignored here, unlike the sys.servers checks - confirm this is intended
        return;
      }

      if (sqlResult.length) {
        // Grab the auto-compaction definitions and ignore dataSources that already have auto-compaction
        let compactionResult: any;
        try {
          compactionResult = (await axios.get('/druid/coordinator/v1/config/compaction')).data;
        } catch {
          controls.addIssue(`Could not get compaction config. Something is wrong.`);
          return;
        }

        if (!compactionResult.compactionConfigs) return;

        if (!Array.isArray(compactionResult.compactionConfigs)) {
          controls.addIssue(`Got invalid value from compaction config. Something is wrong.`);
          return;
        }

        const dataSourcesWithCompaction = compactionResult.compactionConfigs.map(
          (d: any) => d.dataSource,
        );

        sqlResult = sqlResult.filter(d => !dataSourcesWithCompaction.includes(d['datasource']));

        for (const datasource of sqlResult) {
          controls.addSuggestion(
            `Datasource "${
              datasource['datasource']
            }" could benefit from auto-compaction as it has ${pluralIfNeeded(
              datasource['num_bad_time_chunks'],
              'time chunk',
            )} that have multiple small segments that could be compacted.`,
          );
        }
      }
    },
  },
];
|