Nagios Check NRPE : Check Mining RIG Temp

Prerequisites: a mining rig running ethOS, a Nagios server that is already set up, and nagios-nrpe-server installed on your ethOS mining rig (for help with this, see: Installing Nagios-Nrpe-Server on ethOS).

 

This procedure will help you fetch the GPU temperature data from your ethOS rig and alert you if you are losing cards in your mining rig. A GPU is reported as critical once it reaches the threshold set in gpuMaxTemp (79 degrees C in the script below).

This script has to be installed on your mining rig; your Nagios server then calls it remotely through NRPE.

 

Let’s set it up in your nrpe.cfg file ( usually in /etc/nagios/nrpe.cfg )

1
2
# RIG
command[check_rig-temp]=python /usr/lib/nagios/plugins/check_rig-temp

 

And here is the check itself, to be placed in /usr/lib/nagios/plugins/check_rig-temp

Do not forget to edit your panel URL :

  gpuJsonSite = "http://XXXXX.ethosdistro.com/?json=yes"

by replacing the XXXXX with your ID [http://XXXXX.ethosdistro.com/?json=yes]

(you can find your ID by typing helpme on your rig directly in command line.)

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/python
#*****************************************************************
# Author: David Bayle
# Contact: contact@davidbayle.com
# This Python script checks your ethOS mining rig temperatures for Nagios monitoring
#


import os
import sys
import time
import datetime
import json
import commands

from urllib import urlopen


# Exit codes passed to sys.exit(); the values 0-3 follow the Nagios plugin
# convention (OK, WARNING, CRITICAL, UNKNOWN).
STATE_OK = 0
STATE_WARNING = 1
STATE_CRITICAL = 2
STATE_UNKNOWN = 3

# Exit code used by the final sys.exit() at the bottom of the script when no
# earlier branch has exited; updated through return_state().
RETURN_STATE = STATE_OK

# Key looked up under "rigs" in the panel JSON. Defaults to this machine's
# node name; GetPanelInfo() and ProcessArguments() may replace it.
gpuRigName = os.uname()[1]
# Default panel URL; GetPanelInfo() and ProcessArguments() may replace it.
gpuJsonSite = "http://tenshi.ethosdistro.com/?json=yes"
# Flag consulted when the running GPU count differs from the installed count.
# NOTE(review): it only lives for the duration of one run of this script.
gpuNotHashing = 0
# Placeholder; the name is reused as the loop variable of the temperature check.
gpuTemp = 0
# A reading greater than or equal to this value is reported as CRITICAL
# (the status messages label the readings as degrees C).
gpuMaxTemp = 79


# ================================   functions  =============================
def PrintOutput(dumpStr):
  """Write dumpStr to standard output, followed by a newline.

  The call form print(...) is used on purpose: with a single argument it
  produces the same output as the Python 2 print statement, and unlike that
  statement it is also valid Python 3 syntax.
  """
  print(dumpStr)

def return_state(state):
  """Store state in the module-level RETURN_STATE variable.

  RETURN_STATE is the value handed to the final sys.exit() of the script,
  so this records the exit code to use if no branch exits earlier.
  """
  global RETURN_STATE
  RETURN_STATE = state


# ============================== process arguments ============================
def ProcessArguments(gotPanelInfo):
  """Optionally take the rig name and the panel URL from the command line.

  gotPanelInfo -- 1 when the panel information is already known. In that
  case every command-line argument is reported as ignored and nothing is
  changed. For any other value the first argument is stored in gpuRigName
  (needed when /var/run/ethos/stats.file is not available) and the second
  in gpuJsonSite (needed when /var/run/ethos/url.file is not available);
  any further arguments are skipped without a message.
  """
  global gpuRigName, gpuJsonSite

  haveInfo = (gotPanelInfo == 1)
  if not haveInfo:
    PrintOutput("Taking rig name and panel url from arguments")

  # sys.argv[0] is the script itself, so only the remaining entries count.
  userArgs = sys.argv[1:]

  if haveInfo:
    for arg in userArgs:
      PrintOutput("Ignoring argument : " + str(arg))
    return

  for position, arg in enumerate(userArgs, 1):
    if position == 1:
      gpuRigName = arg
    elif position == 2:
      gpuJsonSite = arg


def _FirstMatchingLine(path, needle):
  """Return the first line of path containing needle, or None.

  The line is returned without its line ending. None is returned when the
  file cannot be read or when no line contains needle.
  """
  try:
    with open(path) as handle:
      for line in handle:
        if needle in line:
          return line.rstrip("\r\n")
  except (IOError, OSError):
    return None
  return None


def GetPanelInfo():
  """Load the panel URL and the rig name from the ethOS runtime files.

  The panel URL is the first line containing "http" in
  /var/run/ethos/url.file, with "/?json=yes" appended; the rig name comes
  from the first line containing "hostname" in /var/run/ethos/stats.file.
  Each value that is found is stored in its global (gpuJsonSite,
  gpuRigName). A value that cannot be read leaves the existing global
  untouched, prints an UNKNOWN message and records STATE_UNKNOWN through
  return_state().

  Returns 1 when both values were loaded and 0 otherwise, so that the
  caller can fall back to the command-line arguments.
  """
  global gpuRigName, gpuJsonSite

  gotEverything = 1

  panelUrl = _FirstMatchingLine("/var/run/ethos/url.file", "http")
  if panelUrl is None:
    PrintOutput("UNKNOWN - /var/run/ethos/url.file is not availble")
    return_state(STATE_UNKNOWN)
    gotEverything = 0
  else:
    gpuJsonSite = panelUrl + "/?json=yes"

  hostLine = _FirstMatchingLine("/var/run/ethos/stats.file", "hostname")
  if hostLine is None:
    PrintOutput("UNKNOWN - /var/run/ethos/stats.file is not avaible")
    return_state(STATE_UNKNOWN)
    gotEverything = 0
  else:
    # Drops the first 9 characters of the line -- presumably the "hostname"
    # key plus a one-character separator; verify against the stats file.
    gpuRigName = hostLine[9:]

  return gotEverything



# ===================================   run  ================================
# Resolve the rig name and panel URL (runtime files first, then arguments).
success = GetPanelInfo()
ProcessArguments(success)

# Every failure below ends the check immediately with its own exit code, so
# that later steps never run on data that was not obtained.

# fetch the panel page
try:
    url = urlopen(gpuJsonSite).read()
except Exception:
    PrintOutput("UNKNOWN - Invalid URL")
    sys.exit(STATE_UNKNOWN)

# convert site content to json
try:
    result = json.loads(url)
except (TypeError, ValueError):
    PrintOutput("UNKNOWN - Invalid Json")
    sys.exit(STATE_UNKNOWN)

# extract data
try:
    rigData = result["rigs"][gpuRigName]
    numGpus = rigData["gpus"]
    numRunningGpus = rigData["miner_instance"]
    hashRate = rigData["miner_hashes"]
    tempGpus = rigData["temp"]
    tempList = tempGpus.split()
    status = rigData["condition"]
except (LookupError, TypeError, AttributeError):
    PrintOutput("UNKNOWN - Invalid RIG Name")
    sys.exit(STATE_UNKNOWN)

# The figures below are not evaluated when the panel reports the rig as
# unreachable; the WARNING must not be overridden by a later OK.
if status == "unreachable":
    PrintOutput("WARNING: Panel is not updating !")
    sys.exit(STATE_WARNING)

try:
    gpuCount = int(numGpus)
    runningCount = int(numRunningGpus)
except (TypeError, ValueError):
    PrintOutput("UNKNOWN - Invalid GPU count")
    sys.exit(STATE_UNKNOWN)

# check if any gpu is down
if runningCount != gpuCount:
    # NOTE: gpuNotHashing is not persisted between runs, so unless it is set
    # before this point the CRITICAL branch is not taken.
    if gpuNotHashing == 1:
        PrintOutput("CRITICAL: YOU SHOULD REBOOT !!! (" + str(hashRate) + ")")
        sys.exit(STATE_CRITICAL)
    PrintOutput("WARNING: One or more Gpu(s) might have crashed !!")
    sys.exit(STATE_WARNING)

if not tempList:
    PrintOutput("UNKNOWN - No GPU temperature reported")
    sys.exit(STATE_UNKNOWN)

# Inspect every reading before declaring the rig healthy.
try:
    hotReadings = [reading for reading in tempList
                   if float(reading) >= gpuMaxTemp]
except ValueError:
    PrintOutput("UNKNOWN - Invalid GPU temperature")
    sys.exit(STATE_UNKNOWN)

if hotReadings:
    # Report the first reading at or above the threshold.
    PrintOutput("CRITICAL: " + status.upper() + " GPUs: " + str(numRunningGpus) + "/" + str(numGpus) + " - TEMPERATURE EXCEEDED ON ONE GPU: " + str(hotReadings[0]) + " (C)")
    sys.exit(STATE_CRITICAL)

PrintOutput(status.upper() + " OK :: GPUs: " + str(numRunningGpus) + "/" + str(numGpus) + " - TEMPERATURES ARE OK: " + str(tempGpus) + " (C)")
sys.exit(STATE_OK)

 

This will produce, for example:

 

1
2
# python /usr/lib/nagios/plugins/check_rig-temp
MINING OK :: GPUs: 5/5 - TEMPERATURES ARE OK: 51 56 55 53 59 (C)