# Polls Tencent Cloud Monitor for TKE pod memory usage and sends alerts
# through notify_mem when a pod exceeds the configured threshold.
import json
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.monitor.v20180724 import monitor_client, models
import notify_mem
import logging
import sys
import time
import os
import pickle
# Layout of each log record: timestamp, severity, then the message text.
LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
# Layout of the %(asctime)s field in every log record.
DATE_FORMAT = "%m/%d/%YT%H:%M:%S"
# Write INFO-and-above records to a log file (path is relative to the
# working directory the script is started from).
logging.basicConfig(
    filename='tke_pod_mem.log',
    level=logging.INFO,
    format=LOG_FORMAT,
    datefmt=DATE_FORMAT,
)
# Workloads whose pods are queried; used as the "workload_name" condition
# of the monitoring request.
workload_name = [
    "dingtalk-webhook",
    "logstash",
]
# Compute the point in time whose data should be pulled.
def pull_time() -> str:
    """Return the minute-aligned timestamp to query.

    The result is the current local time minus ``sys.argv[4]`` minutes,
    truncated to the whole minute and formatted as
    ``YYYY-MM-DDTHH:MM:00+08:00``.

    Returns:
        The formatted timestamp string.

    Raises:
        IndexError: if ``sys.argv[4]`` is missing.
        ValueError: if ``sys.argv[4]`` is not an integer.
    """
    minutes_ago = int(sys.argv[4])
    # Subtract on the epoch value instead of on the minute field alone, so
    # the hour/day roll back correctly (10:01 minus 2 min -> 09:59, not
    # "10:-1") and the minute is always zero-padded ("10:05", not "10:5").
    target = time.localtime(time.time() - minutes_ago * 60)
    # NOTE(review): the "+08:00" suffix is hard-coded while the fields come
    # from local time, so this presumably expects a UTC+8 host — confirm.
    return time.strftime("%Y-%m-%dT%H:%M", target) + ':00+08:00'
# Fetch the data.
def main(get_time: str) -> list:
    """Query Tencent Cloud Monitor for pod memory usage at one point in time.

    Requests the ``K8sPodRateMemNoCacheLimit`` metric in the ``QCE/TKE``
    namespace for the workloads listed in ``workload_name``, using
    ``get_time`` as both the start and the end of the query window.

    Args:
        get_time: timestamp string sent as ``StartTime`` and ``EndTime``.

    Returns:
        The ``Points`` list of the first ``Data`` entry in the response, or
        an empty list when the SDK call fails or the response carries no
        data. Callers can therefore treat any falsy result as "no data yet".
    """
    params = {
        "Module": "monitor",
        "Namespace": "QCE/TKE",
        "MetricNames": ["K8sPodRateMemNoCacheLimit"],
        "Conditions": [
            {
                "Key": "tke_cluster_instance_id",
                "Operator": "=",
                # Cluster id is blank in this file and must be filled in.
                "Value": [""]
            },
            {
                "Key": "workload_name",
                "Operator": "in",
                "Value": workload_name
            }
        ],
        "Period": 60,
        "StartTime": get_time,
        "EndTime": get_time
    }
    try:
        # Secret id/key and the region argument are blank in this file and
        # must be supplied before the request can succeed.
        cred = credential.Credential("", "")
        httpProfile = HttpProfile()
        httpProfile.endpoint = "monitor.tencentcloudapi.com"
        clientProfile = ClientProfile()
        clientProfile.httpProfile = httpProfile
        client = monitor_client.MonitorClient(cred, "", clientProfile)
        req = models.DescribeStatisticDataRequest()
        req.from_json_string(json.dumps(params))
        resp = client.DescribeStatisticData(req)
        res = resp.to_json_string()
    except TencentCloudSDKException as err:
        logging.error(err)
        # Explicit empty result instead of an implicit None, matching the
        # declared return type.
        return []

    # An empty or missing "Data" array would otherwise raise on [0] and abort
    # the caller's retry loop; report it as "no data" instead.
    data = json.loads(res).get('Data') or []
    if not data:
        return []
    dimensions_values = data[0].get('Points') or []
    # Kept from the original: echoes the raw points to stdout.
    print(dimensions_values)
    return dimensions_values
def _pod_names(dimensions: list) -> list:
    """Return the values of every dimension entry named 'pod_name'."""
    return [dim['Value'] for dim in dimensions if dim['Name'] == 'pod_name']


# Process the data.
def handler(data_lists: list, get_time: str) -> None:
    """Send or resolve memory alerts for each monitored data point.

    For every point whose value exceeds the threshold (``sys.argv[5]``), the
    pod's counter in the module-level ``dict_tmp`` decides what happens:
    a counter of 0 sends an alert through ``notify_mem.notify``; a counter
    equal to ``sys.argv[2]`` is reset to 0 so the next breach alerts again;
    any other counter is incremented. For points at or below the threshold,
    a pod with a positive counter gets ``notify_mem.resolve`` and is reset.

    Pods whose name occurs in the whitelist string (``sys.argv[6]``) are
    never alerted or resolved.

    Args:
        data_lists: list of points; each is a dict with a ``Dimensions``
            list of ``{'Name': ..., 'Value': ...}`` entries and a ``Values``
            list whose first entry holds the metric under ``'Value'``.
        get_time: timestamp string forwarded to the notification calls.

    Returns:
        None. State changes are made in place on ``dict_tmp``.
    """
    max_alerts = int(sys.argv[3])
    repeat_after = int(sys.argv[2])
    threshold = int(sys.argv[5])
    # NOTE(review): membership is a substring test against the raw argument,
    # so a pod name that is a fragment of a whitelisted name also matches —
    # confirm this is intended.
    whitelist = str(sys.argv[6])

    sent = 0
    for point in data_lists:
        # Rate limiting: stop once the allowed number of alerts has been
        # sent. (The original compared with <=, which let one extra alert
        # through.)
        if sent >= max_alerts:
            break

        values = point['Values']
        # Skip points that carry no monitoring data at all.
        if not values:
            continue
        value = values[0]['Value']
        if value is None:
            continue

        over_threshold = float(value) > threshold
        for pod in _pod_names(point['Dimensions']):
            if pod in whitelist:
                if over_threshold:
                    logging.info(f'white pod_name: {pod}')
                continue

            if over_threshold:
                count = dict_tmp.setdefault(pod, 0)
                # A zero counter means no alert is outstanding for this pod.
                if count == 0:
                    notify_mem.notify(pod, value, sys.argv[1], get_time, sys.argv[5])
                    sent += 1
                    dict_tmp[pod] = count + 1
                elif count == repeat_after:
                    # Re-arm so the next breach sends a fresh alert.
                    dict_tmp[pod] = 0
                else:
                    dict_tmp[pod] = count + 1
                logging.info(dict_tmp)
                # NOTE(review): the source lost its indentation; this line is
                # assumed to run for all three branches above — confirm.
                logging.info(f'{pod} values: {value}')
            elif dict_tmp.get(pod, 0) > 0:
                # Back under the threshold after an alert: announce recovery.
                notify_mem.resolve(pod, value, sys.argv[1], get_time, sys.argv[5])
                dict_tmp[pod] = 0
                logging.info('resolved.')
if __name__ == '__main__':
    # Command-line arguments:
    #   argv1: dingding_url, forwarded to the notify_mem calls
    #   argv2: alert interval; handler compares it with a per-pod counter
    #          that advances once per run (the original note said seconds —
    #          NOTE(review): confirm the intended unit)
    #   argv3: number of alerts allowed per run (original note: per minute)
    #   argv4: how many minutes back to fetch data from, e.g. 2
    #   argv5: alert threshold
    #   argv6: whitelist

    # Load the per-pod alert counters saved by the previous run. A missing,
    # empty or unreadable file simply means "no alerts outstanding".
    # NOTE: pickle is acceptable only because this file is written by this
    # script itself; never point it at data from an untrusted source.
    try:
        with open('tmp.dict', 'rb') as state_in:
            dict_tmp = pickle.load(state_in)
    except (FileNotFoundError, EOFError, pickle.UnpicklingError):
        dict_tmp = {}

    # Use a distinct name so the pull_time function is not shadowed.
    query_time = pull_time()
    max_attempts = 5
    for attempt in range(max_attempts):
        get_data = main(query_time)
        if get_data:
            handler(get_data, query_time)
            logging.info('Success.')
            break
        # The data point may not be published yet; wait and retry, but do
        # not waste a delay after the final attempt.
        if attempt < max_attempts - 1:
            logging.info('time sleep 10.')
            time.sleep(10)
    else:
        logging.info('Failed. No value.')

    # Persist the counters for the next run.
    with open('tmp.dict', 'wb') as state_out:
        pickle.dump(dict_tmp, state_out)
    logging.info('done.')
# End of script.