From 90d3f0fb611e5411cca2a35347be7c9cef3ff200 Mon Sep 17 00:00:00 2001 From: Felix Ableitner Date: Sat, 4 Jul 2020 21:58:43 +0200 Subject: [PATCH] Add telegraf for monitoring --- .gitignore | 1 + files/nginx_status.conf | 7 + group_vars/prod.yml | 9 +- peertube.yml => playbooks/peertube.yml | 0 playbooks/site.yml | 3 + playbooks/telegraf.yml | 45 +++ templates/telegraf.conf.j2 | 445 +++++++++++++++++++++++++ 7 files changed, 509 insertions(+), 1 deletion(-) create mode 100644 files/nginx_status.conf rename peertube.yml => playbooks/peertube.yml (100%) create mode 100644 playbooks/site.yml create mode 100644 playbooks/telegraf.yml create mode 100644 templates/telegraf.conf.j2 diff --git a/.gitignore b/.gitignore index d90ddbd..062f03c 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ passwords/ peertube.retry prod .idea/ +vault_pass \ No newline at end of file diff --git a/files/nginx_status.conf b/files/nginx_status.conf new file mode 100644 index 0000000..323a481 --- /dev/null +++ b/files/nginx_status.conf @@ -0,0 +1,7 @@ +server { + listen 8090; + location /nginx_status { + stub_status; + access_log off; + } +} diff --git a/group_vars/prod.yml b/group_vars/prod.yml index 8b43ab1..567d752 100644 --- a/group_vars/prod.yml +++ b/group_vars/prod.yml @@ -13,4 +13,11 @@ postgres_password: !vault | 3030383263346432633336616139373131633161313435650a653037346238383835343664393766 37316234373533363131376338393832353363383931663035613030623631343364336362303536 6363353665343463350a353631356565316638303565663933393338386131346663623932323463 - 62393934383936346566663338636137303132313039353137666561303039373961 \ No newline at end of file + 62393934383936346566663338636137303132313039353137666561303039373961 +telegraf_influxdb_password: !vault | + $ANSIBLE_VAULT;1.1;AES256 + 61343966363633306163646530646361613833663831623139376135396436623835333363663236 + 3235613761363138313236636164646131383234313532370a626234643530373339646133313332 + 
36623563623434323336663262323939326534643834666465333863386231616439636132316436 + 3833303337393633320a313766336236303264376333373535353832646262666634383062303935 + 62393230366331396435313162636136333832623939666663623131343761633031 \ No newline at end of file diff --git a/peertube.yml b/playbooks/peertube.yml similarity index 100% rename from peertube.yml rename to playbooks/peertube.yml diff --git a/playbooks/site.yml b/playbooks/site.yml new file mode 100644 index 0000000..ac1070e --- /dev/null +++ b/playbooks/site.yml @@ -0,0 +1,3 @@ +--- +- import_playbook: peertube.yml +- import_playbook: telegraf.yml diff --git a/playbooks/telegraf.yml b/playbooks/telegraf.yml new file mode 100644 index 0000000..6e8884f --- /dev/null +++ b/playbooks/telegraf.yml @@ -0,0 +1,45 @@ +--- +- hosts: all + + tasks: + # TODO: peertube uses docker nginx container + #- name: copy nginx files + # copy: + # src: '../files/nginx_status.conf' + # dest: '/etc/nginx/sites-enabled/nginx_status.conf' + + - name: add telegraf apt key + apt_key: + url: https://repos.influxdata.com/influxdb.key # a key file is fetched via 'url'; 'keyserver' expects a keyserver host + id: 684A14CF2582E0C5 + state: present + + - name: add telegraf apt repository + apt_repository: + # Note: we need to adjust this manually for different ubuntu versions + repo: 'deb https://repos.influxdata.com/ubuntu bionic stable' + state: present + filename: influxdb + update_cache: yes + + - name: install telegraf + apt: + name: telegraf + state: present + + - name: add telegraf to docker group # must run after the install so the telegraf user already exists + action: user name=telegraf groups="docker" append=yes + + - name: add telegraf config + template: + src: '../templates/telegraf.conf.j2' + dest: '/etc/telegraf/telegraf.conf' + owner: telegraf + group: telegraf + mode: '0600' + + - name: start and enable telegraf service + systemd: + state: reloaded + name: telegraf + enabled: true diff --git a/templates/telegraf.conf.j2 b/templates/telegraf.conf.j2 new file mode 100644 index 0000000..f541afa --- /dev/null +++ b/templates/telegraf.conf.j2 @@ -0,0 
+1,445 @@ + +# Telegraf Configuration +# +# Telegraf is entirely plugin driven. All metrics are gathered from the +# declared inputs, and sent to the declared outputs. +# +# Plugins must be declared in here to be active. +# To deactivate a plugin, comment out the name and any variables. +# +# Use 'telegraf -config telegraf.conf -test' to see what metrics a config +# file would generate. +# +# Environment variables can be used anywhere in this config file, simply surround +# them with ${}. For strings the variable must be within quotes (ie, "${STR_VAR}"), +# for numbers and booleans they should be plain (ie, ${INT_VAR}, ${BOOL_VAR}) + + +# Global tags can be specified here in key="value" format. +[global_tags] +# dc = "us-east-1" # will tag all metrics with dc=us-east-1 +# rack = "1a" +## Environment variables can be used as tags, and throughout the config file +# user = "$USER" + + +# Configuration for telegraf agent +[agent] +## Default data collection interval for all inputs +interval = "10s" +## Rounds collection interval to 'interval' +## ie, if interval="10s" then always collect on :00, :10, :20, etc. +round_interval = true + +## Telegraf will send metrics to outputs in batches of at most +## metric_batch_size metrics. +## This controls the size of writes that Telegraf sends to output plugins. +metric_batch_size = 1000 + +## Maximum number of unwritten metrics per output. Increasing this value +## allows for longer periods of output downtime without dropping metrics at the +## cost of higher maximum memory usage. +metric_buffer_limit = 10000 + +## Collection jitter is used to jitter the collection by a random amount. +## Each plugin will sleep for a random time within jitter before collecting. +## This can be used to avoid many plugins querying things like sysfs at the +## same time, which can have a measurable effect on the system. +collection_jitter = "0s" + +## Default flushing interval for all outputs. 
Maximum flush_interval will be +## flush_interval + flush_jitter +flush_interval = "10s" +## Jitter the flush interval by a random amount. This is primarily to avoid +## large write spikes for users running a large number of telegraf instances. +## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s +flush_jitter = "0s" + +## By default or when set to "0s", precision will be set to the same +## timestamp order as the collection interval, with the maximum being 1s. +## ie, when interval = "10s", precision will be "1s" +## when interval = "250ms", precision will be "1ms" +## Precision will NOT be used for service inputs. It is up to each individual +## service input to set the timestamp at the appropriate precision. +## Valid time units are "ns", "us" (or "µs"), "ms", "s". +precision = "" + +## Log at debug level. +# debug = false +## Log only error level messages. +# quiet = false + +## Log target controls the destination for logs and can be one of "file", +## "stderr" or, on Windows, "eventlog". When set to "file", the output file +## is determined by the "logfile" setting. +# logtarget = "file" + +## Name of the file to be logged to when using the "file" logtarget. If set to +## the empty string then logs are written to stderr. +# logfile = "" + +## The logfile will be rotated after the time interval specified. When set +## to 0 no time based rotation is performed. Logs are rotated only when +## written to, if there is no log activity rotation may be delayed. +# logfile_rotation_interval = "0d" + +## The logfile will be rotated when it becomes larger than the specified +## size. When set to 0 no size based rotation is performed. +# logfile_rotation_max_size = "0MB" + +## Maximum number of rotated archives to keep, any older logs are deleted. +## If set to -1, no archives are removed. 
+# logfile_rotation_max_archives = 5 + +## Override default hostname, if empty use os.Hostname() +hostname = "" +## If set to true, do not set the "host" tag in the telegraf agent. +omit_hostname = false + + +############################################################################### +# OUTPUT PLUGINS # +############################################################################### + + +# Configuration for sending metrics to InfluxDB +[[outputs.influxdb]] +## The full HTTP or UDP URL for your InfluxDB instance. +## +## Multiple URLs can be specified for a single cluster, only ONE of the +## urls will be written to each interval. +# urls = ["unix:///var/run/influxdb.sock"] +# urls = ["udp://127.0.0.1:8089"] +# urls = ["http://127.0.0.1:8086"] +urls = ["https://grafana.yerbamate.dev/telegraf-input"] + +## The target database for metrics; will be created as needed. +## For UDP url endpoint database needs to be configured on server side. +database = "peertube-social" + +## The value of this tag will be used to determine the database. If this +## tag is not set the 'database' option is used as the default. +# database_tag = "" + +## If true, the 'database_tag' will not be included in the written metric. +# exclude_database_tag = false + +## If true, no CREATE DATABASE queries will be sent. Set to true when using +## Telegraf with a user without permissions to create databases or when the +## database already exists. +# skip_database_creation = false + +## Name of existing retention policy to write to. Empty string writes to +## the default retention policy. Only takes effect when using HTTP. +# retention_policy = "" + +## The value of this tag will be used to determine the retention policy. If this +## tag is not set the 'retention_policy' option is used as the default. +# retention_policy_tag = "" + +## If true, the 'retention_policy_tag' will not be included in the written metric. 
+# exclude_retention_policy_tag = false + +## Write consistency (clusters only), can be: "any", "one", "quorum", "all". +## Only takes effect when using HTTP. +# write_consistency = "any" + +## Timeout for HTTP messages. +# timeout = "5s" + +## HTTP Basic Auth +username = "telegraf" +password = "{{ telegraf_influxdb_password }}" + +## HTTP User-Agent +# user_agent = "telegraf" + +## UDP payload size is the maximum packet size to send. +# udp_payload = "512B" + +## Optional TLS Config for use on HTTP connections. +# tls_ca = "/etc/telegraf/ca.pem" +# tls_cert = "/etc/telegraf/cert.pem" +# tls_key = "/etc/telegraf/key.pem" +## Use TLS but skip chain & host verification +# insecure_skip_verify = false + +## HTTP Proxy override; if unset, the standard proxy environment +## variables are consulted to determine which proxy, if any, should be used. +# http_proxy = "http://corporate.proxy:3128" + +## Additional HTTP headers +# http_headers = {"X-Special-Header" = "Special-Value"} + +## HTTP Content-Encoding for write request body, can be set to "gzip" to +## compress body or "identity" to apply no encoding. +# content_encoding = "identity" + +## When true, Telegraf will output unsigned integers as unsigned values, +## i.e.: "42u". You will need a version of InfluxDB supporting unsigned +## integer values. Enabling this option will result in field type errors if +## existing data has been written. +# influx_uint_support = false + + +############################################################################### +# INPUT PLUGINS # +############################################################################### + + +# Read metrics about cpu usage +[[inputs.cpu]] +## Whether to report per-cpu stats or not +percpu = true +## Whether to report total system cpu stats or not +totalcpu = true +## If true, collect raw CPU time metrics. +collect_cpu_time = false +## If true, compute and report the sum of all non-idle CPU states. 
+report_active = false + + +# Read metrics about disk usage by mount point +[[inputs.disk]] +## By default stats will be gathered for all mount points. +## Setting mount_points will restrict the stats to only the specified mount points. +# mount_points = ["/"] + +## Ignore mount points by filesystem type. +ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"] + + +# Read metrics about disk IO by device +[[inputs.diskio]] +## By default, telegraf will gather stats for all devices including +## disk partitions. +## Setting devices will restrict the stats to the specified devices. +# devices = ["sda", "sdb", "vd*"] +## Uncomment the following line if you need disk serial numbers. +# skip_serial_number = false +# +## On systems which support it, device metadata can be added in the form of +## tags. +## Currently only Linux is supported via udev properties. You can view +## available properties for a device by running: +## 'udevadm info -q property -n /dev/sda' +## Note: Most, but not all, udev properties can be accessed this way. Properties +## that are currently inaccessible include DEVTYPE, DEVNAME, and DEVPATH. +# device_tags = ["ID_FS_TYPE", "ID_FS_USAGE"] +# +## Using the same metadata source as device_tags, you can also customize the +## name of the device via templates. +## The 'name_templates' parameter is a list of templates to try and apply to +## the device. The template may contain variables in the form of '$PROPERTY' or +## '${PROPERTY}'. The first template which does not contain any variables not +## present for the device is used as the device name tag. +## The typical use case is for LVM volumes, to get the VG/LV name instead of +## the near-meaningless DM-0 name. 
+# name_templates = ["$ID_FS_LABEL","$DM_VG_NAME/$DM_LV_NAME"] + + +# Get kernel statistics from /proc/stat +[[inputs.kernel]] +# no configuration + + +# Read metrics about memory usage +[[inputs.mem]] +# no configuration + + +# Get the number of processes and group them by status +[[inputs.processes]] +# no configuration + + +# Read metrics about swap memory usage +[[inputs.swap]] +# no configuration + + +# Read metrics about system load & uptime +[[inputs.system]] +## Uncomment to remove deprecated metrics. +fielddrop = ["uptime_format"] + + +[[inputs.net]] +interfaces = ["eth0"] + + +# Read metrics about docker containers +[[inputs.docker]] +## Docker Endpoint +## To use TCP, set endpoint = "tcp://[ip]:[port]" +## To use environment variables (ie, docker-machine), set endpoint = "ENV" +endpoint = "unix:///var/run/docker.sock" + +## Set to true to collect Swarm metrics(desired_replicas, running_replicas) +gather_services = false + +## Only collect metrics for these containers, collect all if empty +container_names = [] + +## Set the source tag for the metrics to the container ID hostname, eg first 12 chars +source_tag = false + +## Containers to include and exclude. Globs accepted. +## Note that an empty array for both will include all containers +container_name_include = [] +container_name_exclude = [] + +## Container states to include and exclude. Globs accepted. +## When empty only containers in the "running" state will be captured. +## example: container_state_include = ["created", "restarting", "running", "removing", "paused", "exited", "dead"] +## example: container_state_exclude = ["created", "restarting", "running", "removing", "paused", "exited", "dead"] +# container_state_include = [] +# container_state_exclude = [] + +## Timeout for docker list, info, and stats commands +timeout = "5s" + +## Whether to report for each container per-device blkio (8:0, 8:1...) and +## network (eth0, eth1, ...) 
stats or not +perdevice = true + +## Whether to report for each container total blkio and network stats or not +total = false + +## Which environment variables should we use as a tag +##tag_env = ["JAVA_HOME", "HEAP_SIZE"] + +## docker labels to include and exclude as tags. Globs accepted. +## Note that an empty array for both will include all labels as tags +docker_label_include = [] +docker_label_exclude = [] + +## Optional TLS Config +# tls_ca = "/etc/telegraf/ca.pem" +# tls_cert = "/etc/telegraf/cert.pem" +# tls_key = "/etc/telegraf/key.pem" +## Use TLS but skip chain & host verification +# insecure_skip_verify = false + + +# Read Nginx's basic status information (ngx_http_stub_status_module) +#[[inputs.nginx]] +# An array of Nginx stub_status URI to gather stats. +#urls = ["http://localhost:8090/nginx_status"] +## Optional TLS Config +# tls_ca = "/etc/telegraf/ca.pem" +# tls_cert = "/etc/telegraf/cert.cer" +# tls_key = "/etc/telegraf/key.key" +## Use TLS but skip chain & host verification +# insecure_skip_verify = false +# HTTP response timeout (default: 5s) +# response_timeout = "5s" + + +# # Read nginx_upstream_check module status information (https://github.com/yaoweibin/nginx_upstream_check_module) +# [[inputs.nginx_upstream_check]] +# ## An URL where Nginx Upstream check module is enabled +# ## It should be set to return a JSON formatted response +# url = "http://127.0.0.1/status?format=json" +# +# ## HTTP method +# # method = "GET" +# +# ## Optional HTTP headers +# # headers = {"X-Special-Header" = "Special-Value"} +# +# ## Override HTTP "Host" header +# # host_header = "check.example.com" +# +# ## Timeout for HTTP requests +# timeout = "5s" +# +# ## Optional HTTP Basic Auth credentials +# # username = "username" +# # password = "pa$$word" +# +# ## Optional TLS Config +# # tls_ca = "/etc/telegraf/ca.pem" +# # tls_cert = "/etc/telegraf/cert.pem" +# # tls_key = "/etc/telegraf/key.pem" +# ## Use TLS but skip chain & host verification +# # 
insecure_skip_verify = false + + +############################################################################### +# SERVICE INPUT PLUGINS # +############################################################################### + + +# Read logging output from the Docker engine +[[inputs.docker_log]] +# Docker Endpoint +# To use TCP, set endpoint = "tcp://[ip]:[port]" +# To use environment variables (ie, docker-machine), set endpoint = "ENV" +endpoint = "unix:///var/run/docker.sock" +# When true, container logs are read from the beginning; otherwise +# reading begins at the end of the log. +from_beginning = false + +## Timeout for Docker API calls. +# timeout = "5s" + +## Containers to include and exclude. Globs accepted. +## Note that an empty array for both will include all containers +# container_name_include = [] +# container_name_exclude = [] + +## Container states to include and exclude. Globs accepted. +## When empty only containers in the "running" state will be captured. +# container_state_include = [] +# container_state_exclude = [] + +## docker labels to include and exclude as tags. Globs accepted. +## Note that an empty array for both will include all labels as tags +# docker_label_include = [] +# docker_label_exclude = [] + +## Set the source tag for the metrics to the container ID hostname, eg first 12 chars +source_tag = false + +## Optional TLS Config +# tls_ca = "/etc/telegraf/ca.pem" +# tls_cert = "/etc/telegraf/cert.pem" +# tls_key = "/etc/telegraf/key.pem" +## Use TLS but skip chain & host verification +# insecure_skip_verify = false + + +# # Read metrics from one or many postgresql servers +# [[inputs.postgresql]] +# ## specify address via a url matching: +# ## postgres://[pqgotest[:password]]@localhost[/dbname]\ +# ## ?sslmode=[disable|verify-ca|verify-full] +# ## or a simple string: +# ## host=localhost user=pqotest password=... sslmode=... dbname=app_production +# ## +# ## All connection parameters are optional. 
+# ## +# ## Without the dbname parameter, the driver will default to a database +# ## with the same name as the user. This dbname is just for instantiating a +# ## connection with the server and doesn't restrict the databases we are trying +# ## to grab metrics for. +# ## +# address = "host=localhost user=postgres sslmode=disable" +# ## A custom name for the database that will be used as the "server" tag in the +# ## measurement output. If not specified, a default one generated from +# ## the connection address is used. +# # outputaddress = "db01" +# +# ## connection configuration. +# ## maxlifetime - specify the maximum lifetime of a connection. +# ## default is forever (0s) +# max_lifetime = "0s" +# +# ## A list of databases to explicitly ignore. If not specified, metrics for all +# ## databases are gathered. Do NOT use with the 'databases' option. +# # ignored_databases = ["postgres", "template0", "template1"] +# +# ## A list of databases to pull metrics about. If not specified, metrics for all +# ## databases are gathered. Do NOT use with the 'ignored_databases' option. +# # databases = ["app_production", "testing"]