summaryrefslogtreecommitdiff
path: root/util/temp_metrics.conf
blob: 394425c1f71756eca5eaac1f84a2eedc773d3f18 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
# Copyright (c) 2012 The Chromium OS Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

description     "Temporary, quick-hack metrics collection & thermal daemon"
author          "chromium-os-dev@chromium.org"

# This is for quickly adding UMA stats that we may need for
# short-term experiments, when we don't have the time to add
# stuff to metrics_daemon.  That's where it should go in the
# long term.
#
# This is also currently doing a userland thermal loop to allow
# for quick experimentation.  This thermal loop will eventually
# move to the BIOS once the data from experiments help prove its
# efficacy.

# Tie the job's lifetime to the system-services job: start once it has
# started, stop when it begins stopping.
start on started system-services
stop on stopping system-services
# Have Upstart restart the script if its process ends unexpectedly.
respawn

script
  TEMP_OFFSET=273  # difference between K (reported by EC) and C (used in UMA)

  # Thermal loop fields: 1-based column numbers of each setting within one
  # row of the all_steps table. They are handed to get_field, which uses them
  # as awk field numbers.
  CPU_MAX_FREQ_FIELD=1     # first argument of set_cpu_freq
  CPU_MIN_FREQ_FIELD=2     # second argument of set_cpu_freq
  GPU_MAX_FREQ_FIELD=3     # argument of set_gpu_max_freq
  CPU_DUTY_CYCLE_FIELD=4   # value set_duty_cycle writes to MSR 0x19a
  PKG_POWER_LIMIT_FIELD=5  # value set_pkg_power_limit writes to MSR 0x610

  # Thermal loop steps, one row per step, ordered from step 0 (no throttling)
  # to the most aggressive step. Columns, in order, are the five values
  # indexed by the *_FIELD constants: CPU max frequency, CPU min frequency,
  # GPU max frequency, CPU duty-cycle MSR value and package power-limit MSR
  # value. Everything after the fifth column is annotation; it is part of the
  # string (not a shell comment) and is included in the "Throttling" log line.
  #
  # The layout of this string matters: it starts with an empty line, and
  # get_step maps step N to line N + 2 of the text, so no line may be added
  # or removed ahead of the first row.
  all_steps="
  1801000 800000 1150 0 0x180aa00dd8088 # no throttling
  1801000 800000 1150 0 0x180aa00dd8080 # cap pkg to 16W
  1801000 800000 1150 0 0x180aa00dd8078 # cap pkg to 15W
  1801000 800000 1150 0 0x180aa00dd8070 # cap pkg to 14W
  1801000 800000 1150 0 0x180aa00dd8068 # cap pkg to 13W
  1800000 800000 900 0 0x180aa00dd8068 # disable turbo
  1600000 800000 800 0 0x180aa00dd8068 # cap CPU & GPU frequency
  1400000 800000 700 0 0x180aa00dd8068 # cap CPU & GPU frequency
  1200000 800000 600 0 0x180aa00dd8068 # cap CPU & GPU frequency
  1000000 800000 500 0 0x180aa00dd8068 # cap CPU & GPU frequency
  800000 800000 400 0 0x180aa00dd8068 # cap CPU & GPU frequency
  800000 800000 350 0 0x180aa00dd8068 # cap CPU & GPU frequency
  800000 800000 350 0x1c 0x180aa00dd8068 # duty cycle CPU
  800000 800000 350 0x18 0x180aa00dd8068 # duty cycle CPU
  "
  # Index of the last step. The text printed by echo has one line per row
  # plus the empty first line and the final line that only holds the closing
  # quote's indentation; subtracting those two and one more gives a
  # zero-based index.
  # NOTE(review): max_steps does not appear to be read anywhere in this
  # script; thermal_loop uses the TEMP_THRESHOLD_*_MAX_STEP values instead.
  max_steps=$(($(echo "$all_steps" | wc -l) - 3))

  # Print the row of $all_steps that describes one thermal step.
  #
  # Arguments:
  #   $1 - zero-based step number.
  # Outputs:
  #   The matching line of $all_steps, or an empty line when there is none.
  get_step() {
    # The table text opens with an empty line and awk numbers lines from 1,
    # so step N lives on line N + 2.
    line_no=$(($1 + 2))
    selected=$(echo "$all_steps" | awk -v want="$line_no" 'NR == want')
    echo "$selected"
  }

  # Print one whitespace-separated field of a thermal step row.
  #
  # Arguments:
  #   $1 - 1-based field number (one of the *_FIELD constants).
  #   $2 - the row text, as returned by get_step.
  # Outputs:
  #   The requested field on stdout.
  get_field() {
    # The field number is passed with -v rather than being spliced into the
    # awk program text, so it can never be interpreted as awk code.
    out=$(printf '%s\n' "$2" | awk -v col="$1" '{ print $col }')
    # Quoted on purpose: an unquoted expansion would let the shell glob or
    # re-split the extracted value before it is printed.
    printf '%s\n' "$out"
  }

  # Print the reading of EC temperature sensor 9 (the PECI CPU sensor)
  # converted from Kelvin to degrees C.
  get_peci_temp() {
    # Keep only the digits of whatever ectool prints.
    kelvin=$(ectool temps 9 | sed 's/[^0-9]//g')
    echo $((kelvin - $TEMP_OFFSET))
  }

  # Print the temperature of one EC sensor in degrees C.
  #
  # Arguments:
  #   $1 - sensor index as understood by "ectool temps".
  # Outputs:
  #   The temperature in degrees C, or 0 when no reading could be obtained
  #   (ectool failed, or its output contained no digits). Callers treat 0 as
  #   "no valid reading".
  get_sensor_temp() {
    s=$1
    tempc=0
    if out=$(ectool temps "$s"); then
      # Keep only the digits of the output. tr works on the raw text, so the
      # value is never exposed to pathname expansion or word splitting.
      tempk=$(printf '%s' "$out" | tr -cd '0-9')
      # A successful call that printed no number must not turn into
      # 0 - TEMP_OFFSET; report it the same way as a failed call.
      if [ -n "$tempk" ]; then
        tempc=$((tempk - $TEMP_OFFSET))
      fi
    fi
    echo "$tempc"
  }

  # Print the space-separated indices of the sensors that are used as skin
  # temperature sources: the USB C-Object sensor followed by the Charger
  # D-Object sensor.
  get_sensor_list() {
    # Known sensor positions:
    #   USB C-Object: 1 or 13
    #   PCH D-Object: 3
    #   Hinge C-Object: 5 or 15
    #   Charger D-Object: 7
    charger_id=7

    # Assume the alternate position unless sensor 1 identifies itself as the
    # USB C-Object.
    usb_id=13
    if ectool tempsinfo 1 | grep -q "USB C-Object"; then
      usb_id=1
    fi

    echo $usb_id $charger_id
  }

  # Program the calibration coefficients of the four TMP006 sensors through
  # "ectool tmp006cal <index> <S0> <B0> <B1> <B2>".
  set_calibration_data() {
    # B0, B1 and B2 are the same for every sensor.
    shared_b='-2.94e-5 -5.7e-7 4.63e-9'

    # One "<tmp006 index> <S0>" pair per sensor, in order:
    #   0 = USB C-Object, 1 = PCH D-Object,
    #   2 = Hinge C-Object, 3 = Charger D-Object.
    # Note that the sensor numbering is different between the ectool tmp006
    # and temps/tempsinfo commands.
    for pair in \
        '0 2.712e-14' \
        '1 9.301e-14' \
        '2 -11.000e-14' \
        '3 5.141e-14'; do
      # Unquoted on purpose: each string has to split into separate arguments.
      ectool tmp006cal $pair $shared_b
    done
  }

  # Results of the most recent get_max_skin_temp call.
  sensor_temperatures=
  max_skin_temp=0

  # Read the given sensors and publish the results in two globals:
  #   max_skin_temp       - highest reading among the given sensors (0 when
  #                         none of them is above 0).
  #   sensor_temperatures - "index:temp:" for every given sensor, followed by
  #                         the same for sensor 9 (PECI CPU temperature).
  #
  # Arguments:
  #   $* - sensor indices to treat as skin temperature sources.
  get_max_skin_temp() {
    max_skin_temp=0
    sensor_temperatures=

    # $* is expanded unquoted so the arguments are split into indices.
    for sensor in $*; do
      reading=$(get_sensor_temp $sensor)
      sensor_temperatures="${sensor_temperatures}${sensor}:${reading}:"
      if [ $max_skin_temp -lt $reading ]; then
        max_skin_temp=$reading
      fi
    done

    # The PECI CPU temperature is logged too, but deliberately takes no part
    # in the maximum.
    reading=$(get_sensor_temp 9)
    sensor_temperatures="${sensor_temperatures}9:${reading}:"
  }

  # Apply cpufreq scaling limits to every CPU matched by the cpu? glob.
  #
  # Arguments:
  #   $1 - new scaling_max_freq value.
  #   $2 - new scaling_min_freq value.
  set_cpu_freq() {
    ceiling=$1
    floor=$2
    for policy_dir in /sys/devices/system/cpu/cpu?/cpufreq; do
      # Both limits are first pinned to 800000 and only then moved to their
      # targets, maximum before minimum -- presumably so that the minimum is
      # never above the maximum at any intermediate point.
      echo 800000 > "$policy_dir/scaling_min_freq"
      echo 800000 > "$policy_dir/scaling_max_freq"
      echo $ceiling > "$policy_dir/scaling_max_freq"
      echo $floor > "$policy_dir/scaling_min_freq"
    done
  }

  # Set the GPU minimum frequency through the i915 debugfs interface.
  #
  # Besides writing the file, this defines the global GPU_MIN_FREQ, which
  # set_gpu_max_freq uses as the lower bound for the maximum frequency; it
  # therefore has to run before set_gpu_max_freq is first called.
  set_gpu_min_freq() {
    GPU_MIN_FREQ=450
    echo $GPU_MIN_FREQ > /sys/kernel/debug/dri/0/i915_min_freq
  }

  # Set the GPU maximum frequency through the i915 debugfs interface.
  #
  # Arguments:
  #   $1 - requested maximum frequency; raised to GPU_MIN_FREQ if lower.
  # Globals read:
  #   GPU_MIN_FREQ (defined by set_gpu_min_freq).
  set_gpu_max_freq() {
    requested_freq=$1
    # The maximum may never be programmed below the configured minimum.
    if [ $requested_freq -lt $GPU_MIN_FREQ ]; then
      requested_freq=$GPU_MIN_FREQ
    fi
    echo $requested_freq > /sys/kernel/debug/dri/0/i915_max_freq
  }

  # Write the given value to MSR 0x19a on CPUs 0 through 3.
  #
  # Arguments:
  #   $1 - raw MSR value (the duty-cycle column of a thermal step).
  set_duty_cycle() {
    for core in 0 1 2 3; do
      wrmsr $core 0x19a $1
    done
  }

  # Write the given value to MSR 0x610 via CPU 0.
  #
  # Arguments:
  #   $1 - raw MSR value (the package power limit column of a thermal step).
  set_pkg_power_limit() {
    wrmsr 0 0x610 $1
  }

  # Send one message to the system log under the "temp_metrics" tag.
  #
  # Arguments:
  #   $* - words of the message; they are joined with single spaces.
  log_message() {
    message="$*"
    logger -t temp_metrics "$message"
  }

  # Skin temperature bands used by thermal_loop, in degrees C.
  # TEMP_THRESHOLD_n:    band n is selected when the skin temperature is
  #                      strictly above this value (highest band wins);
  #                      band 0 applies when no threshold is exceeded.
  # TEMP_THRESHOLD_n_WM: watermark inside band n. Above it thermal_loop moves
  #                      one step towards more throttling, below it one step
  #                      towards less, and exactly at it the step is held.
  # Band 3's watermark equals its threshold, so while in band 3 the
  # temperature is always above the watermark and the loop only steps up.
  TEMP_THRESHOLD_1=38
  TEMP_THRESHOLD_1_WM=40
  TEMP_THRESHOLD_2=45
  TEMP_THRESHOLD_2_WM=47
  TEMP_THRESHOLD_3=50
  TEMP_THRESHOLD_3_WM=50

  # Inclusive range of all_steps indices that thermal_loop may use while in
  # each band.
  TEMP_THRESHOLD_0_MIN_STEP=0
  TEMP_THRESHOLD_0_MAX_STEP=0
  TEMP_THRESHOLD_1_MIN_STEP=1
  TEMP_THRESHOLD_1_MAX_STEP=5
  TEMP_THRESHOLD_2_MIN_STEP=6
  TEMP_THRESHOLD_2_MAX_STEP=9
  TEMP_THRESHOLD_3_MIN_STEP=10
  TEMP_THRESHOLD_3_MAX_STEP=13

  # Persistent state of thermal_loop: the step it last applied and the
  # candidate step it computed most recently.
  # NOTE(review): the two start out different (1 vs 0), presumably so that the
  # first cool reading makes thermal_loop apply step 0 -- confirm.
  current_step=1
  new_step=0

  # Run one iteration of the userland thermal loop: pick the throttling step
  # for the given skin temperature and, if it differs from the step currently
  # in effect, apply it.
  #
  # Arguments:
  #   $1 - current maximum skin temperature in degrees C.
  # Globals read:
  #   TEMP_THRESHOLD_*, the *_FIELD constants, sensor_temperatures (only for
  #   the log message).
  # Globals written:
  #   current_step, new_step, and every working variable assigned below
  #   (none of them is local).
  thermal_loop() {
    # Hack to reset turbo activation threshold since BIOS can change it
    # underneath us.
    wrmsr 0 0x64c 0x12

    # Select the band for this temperature. A band supplies the watermark to
    # compare against and the inclusive range of steps allowed in that band.
    skin_temp=$1
    if [ $skin_temp -gt $TEMP_THRESHOLD_3 ]; then
      temp_watermark=$TEMP_THRESHOLD_3_WM
      min_step=$TEMP_THRESHOLD_3_MIN_STEP
      max_step=$TEMP_THRESHOLD_3_MAX_STEP
    elif [ $skin_temp -gt $TEMP_THRESHOLD_2 ]; then
      temp_watermark=$TEMP_THRESHOLD_2_WM
      min_step=$TEMP_THRESHOLD_2_MIN_STEP
      max_step=$TEMP_THRESHOLD_2_MAX_STEP
    elif [ $skin_temp -gt $TEMP_THRESHOLD_1 ]; then
      temp_watermark=$TEMP_THRESHOLD_1_WM
      min_step=$TEMP_THRESHOLD_1_MIN_STEP
      max_step=$TEMP_THRESHOLD_1_MAX_STEP
    else
      temp_watermark=0
      min_step=$TEMP_THRESHOLD_0_MIN_STEP
      max_step=$TEMP_THRESHOLD_0_MAX_STEP
    fi

    # Move one step: harder when above the watermark, softer when below,
    # unchanged when exactly on it. If the inner test does not fire, new_step
    # simply keeps the value it had when the previous call finished.
    if [ $skin_temp -gt $temp_watermark ]; then
      if [ $current_step -ne $max_step ]; then
        new_step=$(($current_step + 1))
      fi
    elif [ $skin_temp -lt $temp_watermark ]; then
      if [ $current_step -gt $min_step ]; then
        new_step=$(($current_step - 1))
      fi
    else
      new_step=$current_step
    fi

    # Force the candidate into the band's range. When the band has changed
    # since the last call this can move the step by more than one.
    if [ $new_step -gt $max_step ]; then
        new_step=$max_step
    elif [ $new_step -lt $min_step ]; then
        new_step=$min_step
    fi

    # Nothing changed: leave the hardware settings alone.
    if  [ $new_step -eq $current_step ]; then
      return
    fi

    current_step=$new_step
    step=$(get_step $new_step)

    # $step is unquoted, so the whole row (including its trailing annotation)
    # is appended to the message word by word.
    log_message "Throttling (temps: $sensor_temperatures):" $step

    # Pull the individual settings out of the row ...
    cpu_max_freq=$(get_field $CPU_MAX_FREQ_FIELD "$step")
    cpu_min_freq=$(get_field $CPU_MIN_FREQ_FIELD "$step")
    gpu_max_freq=$(get_field $GPU_MAX_FREQ_FIELD "$step")
    cpu_duty_cycle=$(get_field $CPU_DUTY_CYCLE_FIELD "$step")
    pkg_power_limit=$(get_field $PKG_POWER_LIMIT_FIELD "$step")

    # ... and apply them.
    set_cpu_freq $cpu_max_freq $cpu_min_freq
    set_gpu_max_freq $gpu_max_freq
    set_duty_cycle $cpu_duty_cycle
    set_pkg_power_limit $pkg_power_limit
  }

  # Print the current fan speed as reported by "ectool pwmgetfanrpm".
  #
  # Outputs:
  #   The digits found in ectool's output, or 0 when there are none (ectool
  #   failed or reported something other than a number). fan_loop compares
  #   the result numerically, so an empty string must never be returned.
  get_fan_rpm() {
    reading=$(ectool pwmgetfanrpm | sed 's/[^0-9]//g')
    echo "${reading:-0}"
  }

  # Ask the EC to run the fan at the given speed.
  #
  # Arguments:
  #   $1 - target RPM, passed to "ectool pwmsetfanrpm".
  set_fan_rpm() {
    target_rpm=$1
    ectool pwmsetfanrpm $target_rpm
  }

  # Put all six fan hysteresis thresholds (temp_low1 .. temp_low6) back to
  # their resting value of 105.
  reset_fan_thresholds() {
    temp_low1=105 temp_low2=105 temp_low3=105
    temp_low4=105 temp_low5=105 temp_low6=105
  }

  # Persistent state of fan_loop.
  #
  # last_rpm: the RPM fan_loop requested most recently. It starts at 10,
  # which is not one of the speeds fan_loop ever selects, so it cannot equal
  # the requested RPM before the first request has been made.
  last_rpm=10
  # temp_lowN: lower hysteresis bound of fan level N (1 = fastest). fan_loop
  # lowers the bound of the level it is currently in and resets all others to
  # 105.
  # NOTE(review): 105 is presumably chosen because skin temperatures never
  # reach it, which makes a resting threshold inert -- confirm.
  temp_low1=105
  temp_low2=105
  temp_low3=105
  temp_low4=105
  temp_low5=105
  temp_low6=105

  # Run one iteration of the userland fan loop: map the skin temperature to
  # a fan speed (with hysteresis) and program the fan if needed.
  #
  # Arguments:
  #   $1 - current maximum skin temperature in degrees C.
  # Globals read:
  #   sensor_temperatures (only for the log message).
  # Globals written:
  #   last_rpm, temp_low1 .. temp_low6, and the working variables skin_temp,
  #   rpm and cur_rpm.
  fan_loop() {
    skin_temp=$1

    # Levels are tested from fastest to slowest. A level is entered when the
    # temperature exceeds its fixed upper bound. On entry every temp_lowN is
    # reset and only this level's bound is lowered, so on later calls the
    # same level stays selected until the temperature drops to or below that
    # lower bound.
    if [ $skin_temp -gt 48 ] || [ $skin_temp -gt $temp_low1 ]; then
      rpm=9300
      reset_fan_thresholds
      temp_low1=46
    elif [ $skin_temp -gt 44 ] || [ $skin_temp -gt $temp_low2 ]; then
      rpm=8000
      reset_fan_thresholds
      temp_low2=43
    elif [ $skin_temp -gt 42 ] || [ $skin_temp -gt $temp_low3 ]; then
      rpm=7000
      reset_fan_thresholds
      temp_low3=41
    elif [ $skin_temp -gt 40 ] || [ $skin_temp -gt $temp_low4 ]; then
      rpm=5500
      reset_fan_thresholds
      temp_low4=39
    elif [ $skin_temp -gt 38 ] || [ $skin_temp -gt $temp_low5 ]; then
      rpm=4000
      reset_fan_thresholds
      temp_low5=34
    elif [ $skin_temp -gt 33 ] || [ $skin_temp -gt $temp_low6 ]; then
      rpm=3000
      reset_fan_thresholds
      temp_low6=30
    else
      # Cool enough: fan off, no hysteresis bound active.
      rpm=0
      reset_fan_thresholds
    fi

    # During S0->S3->S0 transitions, the EC sets the fan RPM to 0. This script
    # isn't aware of such transitions. Read the current fan RPM again to see
    # if it got set to 0. Note that comparing the current fan RPM against last
    # requested RPM won't suffice since the actual fan RPM may not be exactly
    # what was requested.
    #
    # Skip reprogramming when the fan is spinning and the requested speed has
    # not changed, or when the fan is stopped and stopped is what we want.
    # If cur_rpm is not a number, both numeric tests fail and the fan is
    # programmed again below.
    cur_rpm=$(get_fan_rpm)
    if ([ $cur_rpm -ne 0 ] && [ $last_rpm -eq $rpm ]) || \
       ([ $cur_rpm -eq 0 ] && [ $rpm -eq 0 ]); then
      last_rpm=$rpm
      return
    fi

    log_message "Setting fan RPM (temps: $sensor_temperatures): $last_rpm -> $rpm"

    last_rpm=$rpm
    set_fan_rpm $rpm
  }

  # One-time start-up sequence, run before the main loop.

  # Thermal zone 1 is for operating systems where a userland thermal loop
  # doesn't exist. Disable it. The write is skipped when the mode file is
  # absent.
  if [ -e /sys/class/thermal/thermal_zone1/mode ]; then
    echo -n 'disabled' > /sys/class/thermal/thermal_zone1/mode
  fi

  # Get list of sensors to monitor (space-separated sensor indices).
  sensor_list=$(get_sensor_list)

  # Set sensor calibration data.
  set_calibration_data

  # Set minimum GPU frequency. This also defines GPU_MIN_FREQ, which
  # set_gpu_max_freq relies on, so it must happen before the thermal loop
  # first runs.
  set_gpu_min_freq

  # loop_count: iterations since metrics were last reported.
  # ec_fan_loop: 1 while fan control has been handed back to the EC.
  loop_count=0
  ec_fan_loop=0

  # Main loop: every 10 seconds read the skin temperature and run the fan
  # and thermal loops; every third iteration also report all sensor
  # temperatures to UMA.
  while true; do
    sleep 10
    loop_count=$(($loop_count + 1))

    # Read the max skin temperature. sensor_list is unquoted so that each
    # index becomes its own argument.
    get_max_skin_temp $sensor_list

    # A maximum of 0 means no monitored sensor produced a reading above 0
    # (get_sensor_temp prints 0 when ectool fails).
    if [ $max_skin_temp -eq 0 ]; then
      # Hand fan control to the EC once, on the first invalid reading.
      if [ $ec_fan_loop -eq 0 ]; then
        log_message "Invalid max skin temp. Switching to EC fan loop."
        ectool autofanctrl
        ec_fan_loop=1
        # Back to the start-up value, which matches no speed fan_loop selects.
        last_rpm=10
      fi
    else
      # Run the fan loop.
      fan_loop $max_skin_temp
      ec_fan_loop=0

      # Run the thermal loop.
      thermal_loop $max_skin_temp
    fi

    # Report the metrics once every 30 seconds (three 10-second iterations).
    if [ $loop_count -lt 3 ]; then
      continue
    fi
    loop_count=0

    # The while loop is the tail of a pipeline, so variables assigned inside
    # it are not meant to be used after the loop.
    # NOTE(review): the parsing below presumably expects lines of the form
    # "<index>: <kelvin>" -- confirm against ectool's output.
    ectool temps all | while read line; do
      # Text before the first ':' is the sensor index, zero-padded to two
      # digits for the metric name.
      index=$(printf "%02d" "${line%%:*}")
      # Text after the last space is the reading in Kelvin.
      tempk="${line##* }"
      tempc=$(($tempk - $TEMP_OFFSET))
      # ignore values below freezing
      if [ $tempc -lt 0 ]; then
        tempc=0
      fi
      # Use a linear histogram with 1 C buckets starting at 0.
      N_SLOTS=180
      metrics_client -e Platform.Temperature.Sensor$index $tempc $N_SLOTS
    done
  done
end script