From 90d3f0fb611e5411cca2a35347be7c9cef3ff200 Mon Sep 17 00:00:00 2001
From: Felix Ableitner <me@nutomic.com>
Date: Sat, 4 Jul 2020 21:58:43 +0200
Subject: [PATCH] Add telegraf for monitoring

---
 .gitignore                             |   1 +
 files/nginx_status.conf                |   7 +
 group_vars/prod.yml                    |   9 +-
 peertube.yml => playbooks/peertube.yml |   0
 playbooks/site.yml                     |   3 +
 playbooks/telegraf.yml                 |  45 +++
 templates/telegraf.conf.j2             | 445 +++++++++++++++++++++++++
 7 files changed, 509 insertions(+), 1 deletion(-)
 create mode 100644 files/nginx_status.conf
 rename peertube.yml => playbooks/peertube.yml (100%)
 create mode 100644 playbooks/site.yml
 create mode 100644 playbooks/telegraf.yml
 create mode 100644 templates/telegraf.conf.j2

diff --git a/.gitignore b/.gitignore
index d90ddbd..062f03c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@ passwords/
 peertube.retry
 prod
 .idea/
+vault_pass
\ No newline at end of file
diff --git a/files/nginx_status.conf b/files/nginx_status.conf
new file mode 100644
index 0000000..323a481
--- /dev/null
+++ b/files/nginx_status.conf
@@ -0,0 +1,7 @@
+server {  # status endpoint polled by the telegraf nginx input
+    listen 127.0.0.1:8090;  # loopback only: stub_status must not be reachable from outside
+    location /nginx_status {
+        stub_status;  # connection/request counters (ngx_http_stub_status_module)
+        access_log off;  # keep the periodic polling out of the access log
+    }
+}
diff --git a/group_vars/prod.yml b/group_vars/prod.yml
index 8b43ab1..567d752 100644
--- a/group_vars/prod.yml
+++ b/group_vars/prod.yml
@@ -13,4 +13,11 @@ postgres_password: !vault |
   3030383263346432633336616139373131633161313435650a653037346238383835343664393766
   37316234373533363131376338393832353363383931663035613030623631343364336362303536
   6363353665343463350a353631356565316638303565663933393338386131346663623932323463
-  62393934383936346566663338636137303132313039353137666561303039373961
\ No newline at end of file
+  62393934383936346566663338636137303132313039353137666561303039373961
+telegraf_influxdb_password: !vault |
+  $ANSIBLE_VAULT;1.1;AES256
+  61343966363633306163646530646361613833663831623139376135396436623835333363663236
+  3235613761363138313236636164646131383234313532370a626234643530373339646133313332
+  36623563623434323336663262323939326534643834666465333863386231616439636132316436
+  3833303337393633320a313766336236303264376333373535353832646262666634383062303935
+  62393230366331396435313162636136333832623939666663623131343761633031
\ No newline at end of file
diff --git a/peertube.yml b/playbooks/peertube.yml
similarity index 100%
rename from peertube.yml
rename to playbooks/peertube.yml
diff --git a/playbooks/site.yml b/playbooks/site.yml
new file mode 100644
index 0000000..ac1070e
--- /dev/null
+++ b/playbooks/site.yml
@@ -0,0 +1,3 @@
+---  # entry point that chains the individual playbooks, in this order
+- import_playbook: peertube.yml
+- import_playbook: telegraf.yml  # monitoring agent
diff --git a/playbooks/telegraf.yml b/playbooks/telegraf.yml
new file mode 100644
index 0000000..6e8884f
--- /dev/null
+++ b/playbooks/telegraf.yml
@@ -0,0 +1,45 @@
+---
+- hosts: all
+
+  tasks:
+  # TODO: peertube uses docker nginx container
+  #- name: copy nginx files
+  #  copy:
+  #    src: '../files/nginx_status.conf'
+  #    dest: '/etc/nginx/sites-enabled/nginx_status.conf'
+
+  - name: add telegraf apt key
+    apt_key:
+      url: 'https://repos.influxdata.com/influxdb.key'  # a key file, so `url`, not `keyserver`
+      id: '684A14CF2582E0C5'  # lets the module skip the download when the key is present
+      state: present
+
+  - name: add telegraf apt repository
+    apt_repository:
+      # Codename comes from gathered facts, so each Ubuntu release gets its own suite
+      repo: 'deb https://repos.influxdata.com/ubuntu {{ ansible_distribution_release }} stable'
+      state: present
+      filename: influxdb  # written as /etc/apt/sources.list.d/influxdb.list
+      update_cache: true
+
+  - name: add telegraf to docker group  # NOTE(review): runs before the package install — confirm order
+    user: {name: telegraf, groups: docker, append: true}
+
+  - name: install telegraf
+    apt:
+      name: telegraf
+      state: present  # install if missing; an already-installed version is left alone
+
+  - name: add telegraf config
+    template:
+      dest: /etc/telegraf/telegraf.conf
+      src: ../templates/telegraf.conf.j2
+      mode: '0600'  # the rendered file contains the InfluxDB password
+      owner: telegraf
+      group: telegraf
+
+  - name: start and enable telegraf service
+    systemd:
+      state: reloaded  # NOTE(review): reloads on every run; confirm a stopped unit gets started
+      name: telegraf
+      enabled: true  # also start the unit at boot
diff --git a/templates/telegraf.conf.j2 b/templates/telegraf.conf.j2
new file mode 100644
index 0000000..f541afa
--- /dev/null
+++ b/templates/telegraf.conf.j2
@@ -0,0 +1,445 @@
+
+# Telegraf Configuration
+#
+# Telegraf is entirely plugin driven. All metrics are gathered from the
+# declared inputs, and sent to the declared outputs.
+#
+# Plugins must be declared in here to be active.
+# To deactivate a plugin, comment out the name and any variables.
+#
+# Use 'telegraf -config telegraf.conf -test' to see what metrics a config
+# file would generate.
+#
+# Environment variables can be used anywhere in this config file, simply surround
+# them with ${}. For strings the variable must be within quotes (ie, "${STR_VAR}"),
+# for numbers and booleans they should be plain (ie, ${INT_VAR}, ${BOOL_VAR})
+
+
+# Global tags can be specified here in key="value" format.
+[global_tags]
+# dc = "us-east-1" # will tag all metrics with dc=us-east-1
+# rack = "1a"
+## Environment variables can be used as tags, and throughout the config file
+# user = "$USER"
+
+
+# Configuration for telegraf agent
+[agent]
+## Default data collection interval for all inputs
+interval = "10s"
+## Rounds collection interval to 'interval'
+## ie, if interval="10s" then always collect on :00, :10, :20, etc.
+round_interval = true
+
+## Telegraf will send metrics to outputs in batches of at most
+## metric_batch_size metrics.
+## This controls the size of writes that Telegraf sends to output plugins.
+metric_batch_size = 1000
+
+## Maximum number of unwritten metrics per output.  Increasing this value
+## allows for longer periods of output downtime without dropping metrics at the
+## cost of higher maximum memory usage.
+metric_buffer_limit = 10000
+
+## Collection jitter is used to jitter the collection by a random amount.
+## Each plugin will sleep for a random time within jitter before collecting.
+## This can be used to avoid many plugins querying things like sysfs at the
+## same time, which can have a measurable effect on the system.
+collection_jitter = "0s"
+
+## Default flushing interval for all outputs. Maximum flush_interval will be
+## flush_interval + flush_jitter
+flush_interval = "10s"
+## Jitter the flush interval by a random amount. This is primarily to avoid
+## large write spikes for users running a large number of telegraf instances.
+## ie, a jitter of 5s and interval 10s means flushes will happen every 10-15s
+flush_jitter = "0s"
+
+## By default or when set to "0s", precision will be set to the same
+## timestamp order as the collection interval, with the maximum being 1s.
+##   ie, when interval = "10s", precision will be "1s"
+##       when interval = "250ms", precision will be "1ms"
+## Precision will NOT be used for service inputs. It is up to each individual
+## service input to set the timestamp at the appropriate precision.
+## Valid time units are "ns", "us" (or "µs"), "ms", "s".
+precision = ""
+
+## Log at debug level.
+# debug = false
+## Log only error level messages.
+# quiet = false
+
+## Log target controls the destination for logs and can be one of "file",
+## "stderr" or, on Windows, "eventlog".  When set to "file", the output file
+## is determined by the "logfile" setting.
+# logtarget = "file"
+
+## Name of the file to be logged to when using the "file" logtarget.  If set to
+## the empty string then logs are written to stderr.
+# logfile = ""
+
+## The logfile will be rotated after the time interval specified.  When set
+## to 0 no time based rotation is performed.  Logs are rotated only when
+## written to, if there is no log activity rotation may be delayed.
+# logfile_rotation_interval = "0d"
+
+## The logfile will be rotated when it becomes larger than the specified
+## size.  When set to 0 no size based rotation is performed.
+# logfile_rotation_max_size = "0MB"
+
+## Maximum number of rotated archives to keep, any older logs are deleted.
+## If set to -1, no archives are removed.
+# logfile_rotation_max_archives = 5
+
+## Override default hostname, if empty use os.Hostname()
+hostname = ""
+## If set to true, do not set the "host" tag in the telegraf agent.
+omit_hostname = false
+
+
+###############################################################################
+#                            OUTPUT PLUGINS                                   #
+###############################################################################
+
+
+# Configuration for sending metrics to InfluxDB
+[[outputs.influxdb]]
+## The full HTTP or UDP URL for your InfluxDB instance.
+##
+## Multiple URLs can be specified for a single cluster, only ONE of the
+## urls will be written to each interval.
+# urls = ["unix:///var/run/influxdb.sock"]
+# urls = ["udp://127.0.0.1:8089"]
+# urls = ["http://127.0.0.1:8086"]
+urls = ["https://grafana.yerbamate.dev/telegraf-input"]
+
+## The target database for metrics; will be created as needed.
+## For UDP url endpoint database needs to be configured on server side.
+database = "peertube-social"
+
+## The value of this tag will be used to determine the database.  If this
+## tag is not set the 'database' option is used as the default.
+# database_tag = ""
+
+## If true, the 'database_tag' will not be included in the written metric.
+# exclude_database_tag = false
+
+## If true, no CREATE DATABASE queries will be sent.  Set to true when using
+## Telegraf with a user without permissions to create databases or when the
+## database already exists.
+# skip_database_creation = false
+
+## Name of existing retention policy to write to.  Empty string writes to
+## the default retention policy.  Only takes effect when using HTTP.
+# retention_policy = ""
+
+## The value of this tag will be used to determine the retention policy.  If this
+## tag is not set the 'retention_policy' option is used as the default.
+# retention_policy_tag = ""
+
+## If true, the 'retention_policy_tag' will not be included in the written metric.
+# exclude_retention_policy_tag = false
+
+## Write consistency (clusters only), can be: "any", "one", "quorum", "all".
+## Only takes effect when using HTTP.
+# write_consistency = "any"
+
+## Timeout for HTTP messages.
+# timeout = "5s"
+
+## HTTP Basic Auth
+username = "telegraf"
+password = "{{ telegraf_influxdb_password }}"
+
+## HTTP User-Agent
+# user_agent = "telegraf"
+
+## UDP payload size is the maximum packet size to send.
+# udp_payload = "512B"
+
+## Optional TLS Config for use on HTTP connections.
+# tls_ca = "/etc/telegraf/ca.pem"
+# tls_cert = "/etc/telegraf/cert.pem"
+# tls_key = "/etc/telegraf/key.pem"
+## Use TLS but skip chain & host verification
+# insecure_skip_verify = false
+
+## HTTP Proxy override. If unset, the standard proxy environment
+## variables are consulted to determine which proxy, if any, should be used.
+# http_proxy = "http://corporate.proxy:3128"
+
+## Additional HTTP headers
+# http_headers = {"X-Special-Header" = "Special-Value"}
+
+## HTTP Content-Encoding for write request body, can be set to "gzip" to
+## compress body or "identity" to apply no encoding.
+# content_encoding = "identity"
+
+## When true, Telegraf will output unsigned integers as unsigned values,
+## i.e.: "42u".  You will need a version of InfluxDB supporting unsigned
+## integer values.  Enabling this option will result in field type errors if
+## existing data has been written.
+# influx_uint_support = false
+
+
+###############################################################################
+#                            INPUT PLUGINS                                    #
+###############################################################################
+
+
+# Read metrics about cpu usage
+[[inputs.cpu]]
+## Whether to report per-cpu stats or not
+percpu = true
+## Whether to report total system cpu stats or not
+totalcpu = true
+## If true, collect raw CPU time metrics.
+collect_cpu_time = false
+## If true, compute and report the sum of all non-idle CPU states.
+report_active = false
+
+
+# Read metrics about disk usage by mount point
+[[inputs.disk]]
+## By default stats will be gathered for all mount points.
+## Setting mount_points will restrict the stats to only the specified mount points.
+# mount_points = ["/"]
+
+## Ignore mount points by filesystem type.
+ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay", "aufs", "squashfs"]
+
+
+# Read metrics about disk IO by device
+[[inputs.diskio]]
+## By default, telegraf will gather stats for all devices including
+## disk partitions.
+## Setting devices will restrict the stats to the specified devices.
+# devices = ["sda", "sdb", "vd*"]
+## Uncomment the following line if you need disk serial numbers.
+# skip_serial_number = false
+#
+## On systems which support it, device metadata can be added in the form of
+## tags.
+## Currently only Linux is supported via udev properties. You can view
+## available properties for a device by running:
+## 'udevadm info -q property -n /dev/sda'
+## Note: Most, but not all, udev properties can be accessed this way. Properties
+## that are currently inaccessible include DEVTYPE, DEVNAME, and DEVPATH.
+# device_tags = ["ID_FS_TYPE", "ID_FS_USAGE"]
+#
+## Using the same metadata source as device_tags, you can also customize the
+## name of the device via templates.
+## The 'name_templates' parameter is a list of templates to try and apply to
+## the device. The template may contain variables in the form of '$PROPERTY' or
+## '${PROPERTY}'. The first template which does not contain any variables not
+## present for the device is used as the device name tag.
+## The typical use case is for LVM volumes, to get the VG/LV name instead of
+## the near-meaningless DM-0 name.
+# name_templates = ["$ID_FS_LABEL","$DM_VG_NAME/$DM_LV_NAME"]
+
+
+# Get kernel statistics from /proc/stat
+[[inputs.kernel]]
+# no configuration
+
+
+# Read metrics about memory usage
+[[inputs.mem]]
+# no configuration
+
+
+# Get the number of processes and group them by status
+[[inputs.processes]]
+# no configuration
+
+
+# Read metrics about swap memory usage
+[[inputs.swap]]
+# no configuration
+
+
+# Read metrics about system load & uptime
+[[inputs.system]]
+## Drop the deprecated "uptime_format" field from this input's metrics.
+fielddrop = ["uptime_format"]
+
+
+[[inputs.net]]  # Read metrics about network interface usage
+interfaces = ["eth0"]  # NOTE(review): only eth0 is collected — confirm the NIC name on each host
+
+
+# Read metrics about docker containers
+[[inputs.docker]]
+## Docker Endpoint
+##   To use TCP, set endpoint = "tcp://[ip]:[port]"
+##   To use environment variables (ie, docker-machine), set endpoint = "ENV"
+endpoint = "unix:///var/run/docker.sock"
+
+## Set to true to collect Swarm metrics (desired_replicas, running_replicas)
+gather_services = false
+
+## Only collect metrics for these containers, collect all if empty
+container_names = []
+
+## Set the source tag for the metrics to the container ID hostname, eg first 12 chars
+source_tag = false
+
+## Containers to include and exclude. Globs accepted.
+## Note that an empty array for both will include all containers
+container_name_include = []
+container_name_exclude = []
+
+## Container states to include and exclude. Globs accepted.
+## When empty only containers in the "running" state will be captured.
+## example: container_state_include = ["created", "restarting", "running", "removing", "paused", "exited", "dead"]
+## example: container_state_exclude = ["created", "restarting", "running", "removing", "paused", "exited", "dead"]
+# container_state_include = []
+# container_state_exclude = []
+
+## Timeout for docker list, info, and stats commands
+timeout = "5s"
+
+## Whether to report for each container per-device blkio (8:0, 8:1...) and
+## network (eth0, eth1, ...) stats or not
+perdevice = true
+
+## Whether to report for each container total blkio and network stats or not
+total = false
+
+## Which environment variables should we use as a tag
+# tag_env = ["JAVA_HOME", "HEAP_SIZE"]
+
+## docker labels to include and exclude as tags.  Globs accepted.
+## Note that an empty array for both will include all labels as tags
+docker_label_include = []
+docker_label_exclude = []
+
+## Optional TLS Config
+# tls_ca = "/etc/telegraf/ca.pem"
+# tls_cert = "/etc/telegraf/cert.pem"
+# tls_key = "/etc/telegraf/key.pem"
+## Use TLS but skip chain & host verification
+# insecure_skip_verify = false
+
+
+# Read Nginx's basic status information (ngx_http_stub_status_module)
+#[[inputs.nginx]]
+# An array of Nginx stub_status URI to gather stats.
+#urls = ["http://localhost:8090/nginx_status"]
+## Optional TLS Config
+# tls_ca = "/etc/telegraf/ca.pem"
+# tls_cert = "/etc/telegraf/cert.cer"
+# tls_key = "/etc/telegraf/key.key"
+## Use TLS but skip chain & host verification
+# insecure_skip_verify = false
+# HTTP response timeout (default: 5s)
+# response_timeout = "5s"
+
+
+# # Read nginx_upstream_check module status information (https://github.com/yaoweibin/nginx_upstream_check_module)
+# [[inputs.nginx_upstream_check]]
+#   ## An URL where Nginx Upstream check module is enabled
+#   ## It should be set to return a JSON formatted response
+#   url = "http://127.0.0.1/status?format=json"
+#
+#   ## HTTP method
+#   # method = "GET"
+#
+#   ## Optional HTTP headers
+#   # headers = {"X-Special-Header" = "Special-Value"}
+#
+#   ## Override HTTP "Host" header
+#   # host_header = "check.example.com"
+#
+#   ## Timeout for HTTP requests
+#   timeout = "5s"
+#
+#   ## Optional HTTP Basic Auth credentials
+#   # username = "username"
+#   # password = "pa$$word"
+#
+#   ## Optional TLS Config
+#   # tls_ca = "/etc/telegraf/ca.pem"
+#   # tls_cert = "/etc/telegraf/cert.pem"
+#   # tls_key = "/etc/telegraf/key.pem"
+#   ## Use TLS but skip chain & host verification
+#   # insecure_skip_verify = false
+
+
+###############################################################################
+#                            SERVICE INPUT PLUGINS                            #
+###############################################################################
+
+
+# Read logging output from the Docker engine
+[[inputs.docker_log]]
+# Docker Endpoint
+#   To use TCP, set endpoint = "tcp://[ip]:[port]"
+#   To use environment variables (ie, docker-machine), set endpoint = "ENV"
+endpoint = "unix:///var/run/docker.sock"
+# When true, container logs are read from the beginning; otherwise
+# reading begins at the end of the log.
+from_beginning = false
+
+## Timeout for Docker API calls.
+# timeout = "5s"
+
+## Containers to include and exclude. Globs accepted.
+## Note that an empty array for both will include all containers
+# container_name_include = []
+# container_name_exclude = []
+
+## Container states to include and exclude. Globs accepted.
+## When empty only containers in the "running" state will be captured.
+# container_state_include = []
+# container_state_exclude = []
+
+## docker labels to include and exclude as tags.  Globs accepted.
+## Note that an empty array for both will include all labels as tags
+# docker_label_include = []
+# docker_label_exclude = []
+
+## Set the source tag for the metrics to the container ID hostname, eg first 12 chars
+source_tag = false
+
+## Optional TLS Config
+# tls_ca = "/etc/telegraf/ca.pem"
+# tls_cert = "/etc/telegraf/cert.pem"
+# tls_key = "/etc/telegraf/key.pem"
+## Use TLS but skip chain & host verification
+# insecure_skip_verify = false
+
+
+# # Read metrics from one or many postgresql servers
+# [[inputs.postgresql]]
+#   ## specify address via a url matching:
+#   ##   postgres://[pqgotest[:password]]@localhost[/dbname]\
+#   ##       ?sslmode=[disable|verify-ca|verify-full]
+#   ## or a simple string:
+#   ##   host=localhost user=pqgotest password=... sslmode=... dbname=app_production
+#   ##
+#   ## All connection parameters are optional.
+#   ##
+#   ## Without the dbname parameter, the driver will default to a database
+#   ## with the same name as the user. This dbname is just for instantiating a
+#   ## connection with the server and doesn't restrict the databases we are trying
+#   ## to grab metrics for.
+#   ##
+#   address = "host=localhost user=postgres sslmode=disable"
+#   ## A custom name for the database that will be used as the "server" tag in the
+#   ## measurement output. If not specified, a default one generated from
+#   ## the connection address is used.
+#   # outputaddress = "db01"
+#
+#   ## connection configuration.
+#   ## maxlifetime - specify the maximum lifetime of a connection.
+#   ## default is forever (0s)
+#   max_lifetime = "0s"
+#
+#   ## A list of databases to explicitly ignore.  If not specified, metrics for all
+#   ## databases are gathered.  Do NOT use with the 'databases' option.
+#   # ignored_databases = ["postgres", "template0", "template1"]
+#
+#   ## A list of databases to pull metrics about. If not specified, metrics for all
+#   ## databases are gathered.  Do NOT use with the 'ignored_databases' option.
+#   # databases = ["app_production", "testing"]