prometheus监控JMX需要向Tomcat内添加jmx_prometheus_javaagent包,并暴露一个端口给外部访问来获取数据。
本文采用0.3.1版本:jmx_prometheus_javaagent-0.3.1.jar
Tomcat采用docker部署,生产环境建议做成镜像用k8s启动。
1、前置准备
本文采用docker方式部署Tomcat
1.1、创建install_tomcat.sh脚本
# cat install_tomcat.sh
# Start a detached Tomcat container with jmx_prometheus_javaagent attached.
# The host directory holding the agent jar and simple-config.yml is mounted at
# /jmx-exporter; the agent listens on 6060 and Tomcat serves HTTP on 8080.
docker run \
  --detach \
  --name tomcat-1 \
  --volume /root/manifests/jvm/prom-jvm-demo:/jmx-exporter \
  --env CATALINA_OPTS="-Xms64m -Xmx128m -javaagent:/jmx-exporter/jmx_prometheus_javaagent-0.3.1.jar=6060:/jmx-exporter/simple-config.yml" \
  --publish 6060:6060 \
  --publish 8080:8080 \
  tomcat:latest
1.2、创建prometheus-serviceMonitorJvm.yaml,用于向kube-prometheus内添加serviceMonitor
# cat prometheus-serviceMonitorJvm.yaml 
---
# ServiceMonitor that makes the Prometheus Operator scrape the JMX exporter.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  namespace: monitoring
  name: jmx-metrics
  labels:
    k8s-apps: jmx-metrics
spec:
  # Name of the Service label whose value is used as the job label.
  jobLabel: metrics
  # Search every namespace for matching Services.
  namespaceSelector:
    any: true
  # Pick Services that carry the label metrics=jmx-metrics.
  selector:
    matchLabels:
      metrics: jmx-metrics
  endpoints:
  # Scrape the Service port with this name every 15 seconds; the name must
  # match the port name declared on the Service.
  - interval: 15s
    port: http-metrics
---
# Selector-less Service that fronts the JMX exporter running on the Docker
# host; its backing addresses come from the hand-written Endpoints object of
# the same name.
apiVersion: v1
kind: Service
metadata:
  annotations:
    # Fixed key: the original "prometheus.io/scrapetrue" was a typo.
    prometheus.io/scrape: "true"
  labels:
    # Must match spec.selector.matchLabels of the ServiceMonitor.
    metrics: jmx-metrics
  name: kube-jmx
  namespace: monitoring
spec:
  ports:
  # Port name referenced by the ServiceMonitor endpoint.
  - name: http-metrics
    # Service port that Prometheus scrapes.
    port: 6060
    protocol: TCP
    # Port bound on the host, i.e. the jmx_prometheus_javaagent port.
    targetPort: 6060
  # No selector on purpose: the Endpoints object is specified manually, and
  # with a selector present the Service's endpoints end up empty after a while.
  # When Tomcat is deployed in Kubernetes, add a selector that matches the
  # Tomcat pod labels instead, for example:
  # selector:
  #   k8s-app: kube-jmx
---
# Hand-maintained Endpoints for the kube-jmx Service (same name and namespace).
apiVersion: v1
kind: Endpoints
metadata:
  namespace: monitoring
  name: kube-jmx
subsets:
- ports:
  # Port on which jmx_prometheus_javaagent listens.
  - name: http-metrics
    protocol: TCP
    port: 6060
  addresses:
  # Placeholder: replace with the IP address of the host that runs Tomcat.
  - ip: 宿主机IP
1.3、在当前目录创建prom-jvm-demo,下载jmx_prometheus_javaagent至prom-jvm-demo目录下
# ls
install_tomcat.sh  prometheus-serviceMonitorJvm.yaml
# mkdir prom-jvm-demo
# wget https://repo1.maven.org/maven2/io/prometheus/jmx/jmx_prometheus_javaagent/0.3.1/jmx_prometheus_javaagent-0.3.1.jar -O prom-jvm-demo/jmx_prometheus_javaagent-0.3.1.jar
1.4、进入prom-jvm-demo目录内,创建jmx_prometheus_javaagent的配置文件simple-config.yml
# cd prom-jvm-demo
# cat simple-config.yml 
---
# jmx_prometheus_javaagent configuration: query only the
# java.lang:type=OperatingSystem MBean and lowercase metric and label names.
lowercaseOutputName: true
lowercaseOutputLabelNames: true
whitelistObjectNames:
  - "java.lang:type=OperatingSystem"
rules:
  # Publish each OperatingSystem attribute as a gauge named os_<attribute>
  # (snake_case), skipping attributes that start with process_cpu_time.
  - pattern: 'java.lang<type=OperatingSystem><>((?!process_cpu_time)\w+):'
    name: os_$1
    attrNameSnakeCase: true
    type: GAUGE
2、启动Tomcat
2.1、执行install_tomcat.sh
# sh install_tomcat.sh
2.2、测试访问6060端口
# curl 127.0.0.1:6060
# HELP jmx_config_reload_success_total Number of times configuration have successfully been reloaded.
# TYPE jmx_config_reload_success_total counter
jmx_config_reload_success_total 0.0
# HELP os_free_swap_space_size FreeSwapSpaceSize (java.lang<type=OperatingSystem><>FreeSwapSpaceSize)
# TYPE os_free_swap_space_size gauge
os_free_swap_space_size 0.0
# HELP os_free_physical_memory_size FreePhysicalMemorySize (java.lang<type=OperatingSystem><>FreePhysicalMemorySize)
# TYPE os_free_physical_memory_size gauge
os_free_physical_memory_size 3.68160768E8
# HELP os_max_file_descriptor_count MaxFileDescriptorCount (java.lang<type=OperatingSystem><>MaxFileDescriptorCount)
# TYPE os_max_file_descriptor_count gauge
os_max_file_descriptor_count 1048576.0
# HELP os_system_load_average SystemLoadAverage (java.lang<type=OperatingSystem><>SystemLoadAverage)
# TYPE os_system_load_average gauge
os_system_load_average 1.74
# HELP os_total_physical_memory_size TotalPhysicalMemorySize (java.lang<type=OperatingSystem><>TotalPhysicalMemorySize)
# TYPE os_total_physical_memory_size gauge
os_total_physical_memory_size 3.974213632E9
# HELP os_committed_virtual_memory_size CommittedVirtualMemorySize (java.lang<type=OperatingSystem><>CommittedVirtualMemorySize)
# TYPE os_committed_virtual_memory_size gauge
os_committed_virtual_memory_size 3.71601408E9
# HELP os_system_cpu_load SystemCpuLoad (java.lang<type=OperatingSystem><>SystemCpuLoad)
# TYPE os_system_cpu_load gauge
os_system_cpu_load 0.10213187902825979
# HELP os_available_processors AvailableProcessors (java.lang<type=OperatingSystem><>AvailableProcessors)
# TYPE os_available_processors gauge
os_available_processors 4.0
# HELP os_process_cpu_load ProcessCpuLoad (java.lang<type=OperatingSystem><>ProcessCpuLoad)
# TYPE os_process_cpu_load gauge
os_process_cpu_load 0.0
......省略
3、创建serviceMonitor
3.1、apply prometheus-serviceMonitorJvm.yaml
# kubectl apply -f prometheus-serviceMonitorJvm.yaml
3.2、查看创建的serviceMonitor、service、endpoints
# kubectl get servicemonitors,svc,endpoints -n monitoring  | grep jmx
servicemonitor.monitoring.coreos.com/jmx-metrics               4d20h
service/kube-jmx                ClusterIP   10.0.0.157   <none>        6060/TCP            4d20h
endpoints/kube-jmx                xxx.xx.x.xxx:6060                         4d20h
4、查看prometheus WEB
4.1、访问prometheus web页面,在 Status -> Targets 中确认 jmx-metrics 对应的target状态为UP
5、添加grafana展示
5.1、在grafana内导入dashboard,ID为8878


6、添加告警规则
6.1、编写prometheus-rules-add-jvm.yaml,也可在prometheus-rules.yaml内追加
# cat prometheus-rules-add-jvm.yaml
# Alerting rules for the JVM metrics exposed by jmx_prometheus_javaagent.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: jvm-metrics-rules
  namespace: monitoring
spec:
  groups:
  - name: jvm-metrics-rules
    rules:
    # More than 10% of the last 5 minutes was spent in GC. The increase (in
    # seconds) is divided by the 300 s window and scaled to a percentage so
    # that $value agrees with the "%" printed in the message; the firing
    # condition is the same as "increase > 30 seconds".
    - alert: GcTimeTooMuch
      expr: increase(jvm_gc_collection_seconds_sum[5m]) / 300 * 100 > 10
      for: 5m
      labels:
        severity: red
      annotations:
        summary: "{{ $labels.app }} GC时间占比超过10%"
        message: "ns:{{ $labels.namespace }} pod:{{ $labels.pod }} ip:{{ $labels.instance }} GC时间占比超过10%,当前值({{ $value }}%)"
    # Too many GC runs: more than 30 collections within 1 minute.
    - alert: GcCountTooMuch
      expr: increase(jvm_gc_collection_seconds_count[1m]) > 30
      for: 1m
      labels:
        severity: red
      annotations:
        summary: "{{ $labels.app }} 1分钟GC次数>30次"
        message: "ns:{{ $labels.namespace }} pod:{{ $labels.pod }} ip:{{ $labels.instance }} 1分钟GC次数>30次,当前值({{ $value }})"
    # Too many full GCs: more than 3 within 1 hour.
    # NOTE(review): this only matches series with gc="ConcurrentMarkSweep";
    # confirm the label value for the collector actually in use.
    - alert: FgcCountTooMuch
      expr: increase(jvm_gc_collection_seconds_count{gc="ConcurrentMarkSweep"}[1h]) > 3
      for: 1m
      labels:
        severity: red
      annotations:
        summary: "{{ $labels.app }} 1小时的FGC次数>3次"
        message: "ns:{{ $labels.namespace }} pod:{{ $labels.pod }} ip:{{ $labels.instance }} 1小时的FGC次数>3次,当前值({{ $value }})"
    # Non-heap memory usage above 80%.
    - alert: NonheapUsageTooMuch
      expr: jvm_memory_bytes_used{job="jmx-metrics", area="nonheap"} / jvm_memory_bytes_max * 100 > 80
      for: 5m
      labels:
        severity: red
      annotations:
        summary: "{{ $labels.app }} 非堆内存使用>80%"
        message: "ns:{{ $labels.namespace }} pod:{{ $labels.pod }} ip:{{ $labels.instance }} 非堆内存使用率>80%,当前值({{ $value }}%)"
    # Resident memory above 85% of physical memory. The denominator is
    # os_total_physical_memory_size, the name the exporter actually emits with
    # the os_$1 rule; os_total_physical_memory_bytes does not exist, so the
    # original expression could never return a result.
    # The alert name keeps its original spelling because routes or silences
    # may reference it.
    - alert: HeighMemUsage
      expr: process_resident_memory_bytes{job="jmx-metrics"} / os_total_physical_memory_size * 100 > 85
      for: 5m
      labels:
        severity: red
      annotations:
        summary: "{{ $labels.app }} rss内存使用率大于85%"
        message: "ns:{{ $labels.namespace }} pod:{{ $labels.pod }} ip:{{ $labels.instance }} rss内存使用率大于85%,当前值({{ $value }}%)"
    # Heap memory usage above 85%. The threshold was 95, which contradicted
    # the 85% stated in the summary and message.
    - alert: heapUsageTooMuch
      expr: jvm_memory_bytes_used{area="heap"} / jvm_memory_bytes_max * 100 > 85
      for: 5m
      labels:
        severity: red
      annotations:
        summary: "{{ $labels.app }} 堆内存使用>85%"
        message: "ns:{{ $labels.namespace }} pod:{{ $labels.pod }} ip:{{ $labels.instance }} 堆内存使用率>85%,当前值({{ $value }}%)"
6.2、执行并在prometheus Alerts内查看
# kubectl apply -f prometheus-rules-add-jvm.yaml

