summaryrefslogtreecommitdiff
path: root/erts/emulator/sys/unix/erl_child_setup.c
blob: 8f261761db2c5f93196a6679365f16776b3ef005 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
/*
 * %CopyrightBegin%
 * 
 * Copyright Ericsson AB 2002-2018. All Rights Reserved.
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * %CopyrightEnd%
 */

/*
 * This program is started at erts startup and all forks that
 * have to be done are done in here. This is done for a couple
 * of reasons:
 *  - Allow usage of fork without a memory explosion.
 *  -- we do not want to use vfork, as it blocks the VM
 *     until the execv is done, and if the program that
 *     is to be executed is on an NFS that is unavailable,
 *     the execv can block for a very long time.
 *  -- we cannot do fork inside the VM as that would temporarily
 *     duplicate the memory usage of the VM per parallel exec.
 *
 * Some implementation notes:
 *  - A single Unix Domain Socket is setup in between the VM and
 *    this program. Over that UDS the file descriptors that should
 *    be used to talk to the child program are sent.
 *    The actual command to execute, together with options and the
 *    environment, is sent over the pipe represented by the
 *    file descriptors mentioned above. We don't send the
 *    command over the UDS as that would increase the likelihood
 *    that its buffer would be full.
 *
 *  - Since it is this program that execv's, it has to take care of
 *    all the SIGCHLD signals that the child programs generate. The
 *    signals are received and the pid+exit reason is sent as data
 *    on the UDS to the VM. The VM is then able to map the pid to the
 *    port of the child program that just exited and deliver the status
 *    code if requested.
 */

#ifdef HAVE_CONFIG_H
#  include "config.h"
#endif

#include <stdlib.h>
#include <stdio.h>
#include <stdarg.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/socket.h>

#define WANT_NONBLOCKING

#include "erl_driver.h"
#include "sys_uds.h"
#include "erl_term.h"
#include "erl_child_setup.h"

#undef ERTS_GLB_INLINE_INCL_FUNC_DEF
#define ERTS_GLB_INLINE_INCL_FUNC_DEF 1
#include "hash.h"

/* Add FD_CLOEXEC to the descriptor flags of fd, keeping the flags that are
   already set. Note that the argument is expanded twice. */
#define SET_CLOEXEC(fd) fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC)

/* Path of the shell used to run commands that are not started through
   spawn_executable (see the execle call in start_new_child). */
#if defined(__ANDROID__)
#define SHELL "/system/bin/sh"
#else
#define SHELL "/bin/sh"
#endif /* __ANDROID__ */

/* Fall back to MSG_NONBLOCK when the platform does not define MSG_DONTWAIT. */
#if !defined(MSG_DONTWAIT) && defined(MSG_NONBLOCK)
#define MSG_DONTWAIT MSG_NONBLOCK
#endif

/* Uncomment the define below to make DEBUG_PRINT write trace lines, prefixed
   with the pid of the calling process, to stderr. When HARD_DEBUG is not
   defined DEBUG_PRINT expands to nothing. */
//#define HARD_DEBUG
#ifdef HARD_DEBUG
#define DEBUG_PRINT(fmt, ...) fprintf(stderr, "%d:" fmt "\r\n", getpid(), ##__VA_ARGS__)
#else
#define DEBUG_PRINT(fmt, ...)
#endif

static char abort_reason[200]; /* for core dump inspection */

/*
 * Report a fatal error and abort the process.
 *
 * The printf-style message is formatted into abort_reason (so that it can
 * be found when inspecting a core dump), echoed to stderr prefixed with
 * "erl_child_setup: " and followed by "\r\n", and then abort() is called.
 * This function does not return.
 *
 * Messages that do not fit in abort_reason are truncated instead of
 * being written past the end of the buffer.
 */
static void ABORT(const char* fmt, ...)
{
    va_list arglist;

    va_start(arglist, fmt);
    /* Bounded formatting: vsnprintf never writes more than the buffer
       holds and always NUL-terminates it. */
    vsnprintf(abort_reason, sizeof(abort_reason), fmt, arglist);
    va_end(arglist);

    fprintf(stderr, "erl_child_setup: %s\r\n", abort_reason);
    abort();
}

#ifdef DEBUG
/*
 * Assertion failure hook (only compiled when DEBUG is defined): prints
 * "<file>:<line>:<func>() Assertion failed: <expr>" to stderr and aborts.
 */
void
erl_assert_error(const char* expr, const char* func, const char* file, int line)
{
    /* Flush pending stdout data before the report is written. */
    fflush(stdout);

    fprintf(stderr, "%s:%d:%s() Assertion failed: %s\n", file, line, func, expr);
    fflush(stderr);

    abort();
}
#endif

/* Add the single signal sig to the set of signals blocked by the caller. */
void sys_sigblock(int sig)
{
    sigset_t only_sig;

    /* Build a set holding just sig ... */
    sigemptyset(&only_sig);
    sigaddset(&only_sig, sig);

    /* ... and block it; the previous mask is not requested. */
    sigprocmask(SIG_BLOCK, &only_sig, NULL);
}

/* Remove the single signal sig from the set of signals blocked by the caller. */
void sys_sigrelease(int sig)
{
    sigset_t only_sig;

    /* Build a set holding just sig ... */
    sigemptyset(&only_sig);
    sigaddset(&only_sig, sig);

    /* ... and unblock it; the previous mask is not requested. */
    sigprocmask(SIG_UNBLOCK, &only_sig, NULL);
}

/* Forward declarations of the os pid -> port id bookkeeping helpers that
   are defined at the end of this file. */
static void add_os_pid_to_port_id_mapping(Eterm, pid_t);
static Eterm get_port_id(pid_t);
static int forker_hash_init(void);

/* Parsed from argv[1] in main(); used there as the exclusive upper bound
   when closing inherited file descriptors one by one. */
static int max_files = -1;
/* Pipe written by handle_sigchld() ([1]) and read by the select loop in
   main() ([0]); carries {pid, wait status} pairs of reaped children. */
static int sigchld_pipe[2];

/*
 * Child side of a spawn request; called in the freshly forked process.
 *
 * pipes[0] is read for the request (and later becomes the child's input),
 * pipes[1] becomes the child's output and pipes[2] its stderr.
 *
 * The request read from pipes[0] is laid out as:
 *   - an int holding the size of the payload that follows
 *   - the payload: 32-bit flags, the command string, the cwd string, the
 *     optional wd string followed by one extra byte, a 32-bit environment
 *     count with that many NUL-terminated strings and, for
 *     spawn_executable only, a 32-bit argument count with that many
 *     NUL-terminated strings
 *   - an ErtsSysForkerProto ack message
 *
 * The strings inside the payload are trusted to be NUL-terminated; only
 * the total length is verified after parsing.
 *
 * This function never returns: it either replaces the process image
 * through exec, calls _exit()/exit() with an error code, or aborts on a
 * protocol error.
 */
static int
start_new_child(int pipes[])
{
    struct sigaction sa;
    int errln = -1;
    int size, res, i, pos = 0;
    char *buff, *o_buff;

    char *cmd, *cwd, *wd, **new_environ, **args = NULL;

    Sint32 cnt, flags;

    /* only child executes here */

    /* Restore default handling of sigterm... */
    sa.sa_handler = SIG_DFL;
    sigemptyset(&sa.sa_mask);
    sa.sa_flags = 0;

    if (sigaction(SIGTERM, &sa, 0) == -1) {
        perror(NULL);
        exit(1);
    }

    do {
        res = read(pipes[0], (char*)&size, sizeof(size));
    } while(res < 0 && (errno == EINTR || errno == ERRNO_BLOCK));

    if (res < 0) {
        errln = __LINE__;
        goto child_error;
    }

    if (res != (int)sizeof(size)) {
        /* End of file or a truncated length prefix: size cannot be
           trusted. Set errno explicitly so that a stale value is not
           reported as the exit code. */
        errno = (res == 0) ? EPIPE : EINVAL;
        errln = __LINE__;
        goto child_error;
    }

    if (size <= 0) {
        errno = EINVAL;
        errln = __LINE__;
        goto child_error;
    }

    buff = malloc(size);
    if (buff == NULL) {
        errno = ENOMEM;
        errln = __LINE__;
        goto child_error;
    }

    DEBUG_PRINT("size = %d", size);

    /* Read the whole payload; it may arrive in several pieces */
    do {
        if ((res = read(pipes[0], buff + pos, size - pos)) < 0) {
            if (errno == ERRNO_BLOCK || errno == EINTR)
                continue;
            errln = __LINE__;
            goto child_error;
        }
        if (res == 0) {
            errno = EPIPE;
            errln = __LINE__;
            goto child_error;
        }
        pos += res;
    } while(size - pos != 0);

    o_buff = buff;

    flags = get_int32(buff);
    buff += sizeof(flags);

    DEBUG_PRINT("flags = %d", flags);

    cmd = buff;
    buff += strlen(buff) + 1;

    cwd = buff;
    buff += strlen(buff) + 1;

    if (*buff == '\0') {
        wd = NULL;
    } else {
        wd = buff;
        buff += strlen(buff) + 1;
    }
    /* one more byte follows the (possibly absent) wd string */
    buff++;

    DEBUG_PRINT("wd = %s", wd);

    cnt = get_int32(buff);
    buff += sizeof(cnt);
    if (cnt < 0) {
        /* a negative count would index before the array below */
        errno = EINVAL;
        errln = __LINE__;
        goto child_error;
    }
    new_environ = malloc(sizeof(char*)*(cnt + 1));
    if (new_environ == NULL) {
        errno = ENOMEM;
        errln = __LINE__;
        goto child_error;
    }

    DEBUG_PRINT("env_len = %d", cnt);
    for (i = 0; i < cnt; i++, buff++) {
        new_environ[i] = buff;
        while(*buff != '\0') buff++;
    }
    new_environ[cnt] = NULL;

    if (o_buff + size != buff) {
        /* This is a spawn executable call */
        cnt = get_int32(buff);
        buff += sizeof(cnt);
        if (cnt < 0) {
            errno = EINVAL;
            errln = __LINE__;
            goto child_error;
        }
        args = malloc(sizeof(char*)*(cnt + 1));
        if (args == NULL) {
            errno = ENOMEM;
            errln = __LINE__;
            goto child_error;
        }
        for (i = 0; i < cnt; i++, buff++) {
            args[i] = buff;
            while(*buff != '\0') buff++;
        }
        args[cnt] = NULL;
    }

    if (o_buff + size != buff) {
        errno = EINVAL;
        errln = __LINE__;
        fprintf(stderr,"erl_child_setup: failed with protocol "
                "error %d on line %d", errno, errln);
        /* we abort here as it is most likely a symptom of an
           emulator/erl_child_setup bug */
        abort();
    }

    DEBUG_PRINT("read ack");
    do {
        ErtsSysForkerProto proto;
        res = read(pipes[0], &proto, sizeof(proto));
        if (res > 0) {
            ASSERT(proto.action == ErtsSysForkerProtoAction_Ack);
            ASSERT(res == sizeof(proto));
        }
    } while(res < 0 && (errno == EINTR || errno == ERRNO_BLOCK));

    if (res < 1) {
        errno = EPIPE;
        errln = __LINE__;
        goto child_error;
    }

    DEBUG_PRINT("Set cwd to: '%s'",cwd);

    if (chdir(cwd) < 0) {
        /* This is not good, it probably means that the cwd of
           beam is invalid. We ignore it and try anyways as
           the child might not need a cwd or the chdir below
           could take us to a valid directory.
        */
    }

    DEBUG_PRINT("Set wd to: '%s'",wd);

    if (wd && chdir(wd) < 0) {
        int err = errno;
        fprintf(stderr,"spawn: Could not cd to %s\r\n", wd);
        _exit(err);
    }

    DEBUG_PRINT("Do that forking business: '%s'",cmd);

    /* When the dup2'ing below is done, only
       fd's 0, 1, 2 and maybe 3, 4 should survive the
       exec. All other fds (i.e. the unix domain sockets
       and stray pipe ends) should have CLOEXEC set on them
       so they will be closed when the exec happens */
    if (flags & FORKER_FLAG_USE_STDIO) {
        /* stdin for process */
        if (flags & FORKER_FLAG_DO_WRITE &&
            dup2(pipes[0], 0) < 0) {
            errln = __LINE__;
            goto child_error;
        }
        /* stdout for process */
        if (flags & FORKER_FLAG_DO_READ &&
            dup2(pipes[1], 1) < 0) {
            errln = __LINE__;
            goto child_error;
        }
    }
    else {	/* XXX will fail if pipes[0] == 4 (unlikely..) */
        if (flags & FORKER_FLAG_DO_READ && dup2(pipes[1], 4) < 0) {
            errln = __LINE__;
            goto child_error;
        }
        if (flags & FORKER_FLAG_DO_WRITE && dup2(pipes[0], 3) < 0) {
            errln = __LINE__;
            goto child_error;
        }
    }

    /* we do the dup2 of stderr last so that errors
       in child_error will be printed to stderr */
    if (dup2(pipes[2], 2) < 0) {
        errln = __LINE__;
        goto child_error;
    }

#if defined(USE_SETPGRP_NOARGS)		/* SysV */
    (void) setpgrp();
#elif defined(USE_SETPGRP)		/* BSD */
    (void) setpgrp(0, getpid());
#else					/* POSIX */
    (void) setsid();
#endif

    close(pipes[0]);
    close(pipes[1]);
    close(pipes[2]);

    sys_sigrelease(SIGCHLD);

    if (args) {
        /* spawn_executable */
        execve(cmd, args, new_environ);
    } else {
        execle(SHELL, "sh", "-c", cmd, (char *) NULL, new_environ);
    }

    /* only reached when the exec call failed */
    DEBUG_PRINT("exec error: %d",errno);
    _exit(errno);

child_error:
    fprintf(stderr,"erl_child_setup: failed with error %d on line %d\r\n",
            errno, errln);
    _exit(errno);
}


/*
 * [OTP-3906]
 * Solaris signal management gets confused when threads are used and a
 * lot of child processes die. The confusion results in SIGCHLD
 * signals not being delivered to the emulator, which in turn results in
 * a lot of defunct processes in the system.
 *
 * The problem seems to appear when a signal is frequently
 * blocked/unblocked at the same time as the signal is frequently
 * propagated. The child waiter thread is a workaround for this problem.
 * The SIGCHLD signal is always blocked (in all threads), and the child
 * waiter thread fetches the signal by a call to sigwait(). See
 * child_waiter().
 *
 * This should be a non-issue since the fork:ing was moved outside of
 * the emulator into erl_child_setup. I'm leaving the comment here
 * for posterity. */

/*
 * SIGCHLD handler: reaps every child that has exited so far and writes one
 * {pid, wait status} pair per child to sigchld_pipe[1]. Aborts if such a
 * pair cannot be written.
 */
static void handle_sigchld(int sig) {
    /* Remember errno on entry; it is put back before returning so that the
       interrupted code does not observe a value set in here. An example of
       when this mattered: the select in main had returned EINTR, but before
       errno was checked this handler ran and waitpid set errno to ECHILD,
       which made erl_child_setup abort as it does not expect ECHILD after
       select. */
    const int saved_errno = errno;
    int report[2];  /* [0] = pid of the reaped child, [1] = its wait status */
    int written;

    sys_sigblock(SIGCHLD);

    for (;;) {
        report[0] = waitpid((pid_t)(-1), &report[1], WNOHANG);
        if (report[0] <= 0) {
            /* no more exited children to collect right now */
            break;
        }

        /* retry the write for as long as it is interrupted */
        do {
            written = write(sigchld_pipe[1], report, sizeof(report));
        } while (written < 0 && errno == EINTR);

        if (written <= 0) {
            ABORT("Failed to write to sigchld_pipe (%d): %d (%d)", sigchld_pipe[1], written, errno);
        }
        DEBUG_PRINT("Reap child %d (%d)", report[0], report[1]);
    }

    sys_sigrelease(SIGCHLD);

    errno = saved_errno;
}

#if defined(__ANDROID__)
/*
 * Returns the number found in the ANDROID_PROPERTY_WORKSPACE environment
 * variable (converted with atoi), or -1 when the variable is not set.
 * The result is remembered in a static so the environment is normally
 * consulted only on the first call.
 */
static int system_properties_fd(void)
{
    static int cached_fd = -2;  /* -2 means "not looked up yet" */

    if (cached_fd == -2) {
        const char *workspace = getenv("ANDROID_PROPERTY_WORKSPACE");

        cached_fd = workspace ? atoi(workspace) : -1;
    }
    return cached_fd;
}
#endif /* __ANDROID__ */

/*
 * Entry point of the forker process.
 *
 * argv[1] must hold a decimal number; it is stored in max_files and used
 * as the upper bound when inherited file descriptors are closed one by one.
 * File descriptor 3 is expected to be the unix domain socket to beam.
 *
 * After setting up signal handling the program loops forever on select():
 *  - a message on the uds starts a new child through fork() and
 *    start_new_child(), and the resulting os pid (or the fork error) is
 *    written to pipes[1];
 *  - a {pid, status} pair on the sigchld pipe is translated to a port id
 *    and reported on the uds, if an exit status report was requested.
 *
 * The process terminates through _exit(0) when the uds is closed or cannot
 * be read, and through ABORT() on unexpected errors.
 */
int
main(int argc, char *argv[])
{
    /* This fd should be open from beam */
    int uds_fd = 3, max_fd = 3;
#ifndef HAVE_CLOSEFROM
    int i;
#endif
    struct sigaction sa;

    /* argv[1] is only present when argc is at least 2 */
    if (argc < 2 || sscanf(argv[1],"%d",&max_files) != 1) {
        ABORT("Invalid arguments to child_setup");
    }

/* We close all fds except the uds from beam.
   All other fds from now on will have the
   CLOEXEC flags set on them. This means that we
   only have to close a very limited number of fds
   after we fork before the exec. */
#if defined(HAVE_CLOSEFROM)
    closefrom(4);
#else
    for (i = 4; i < max_files; i++)
#if defined(__ANDROID__)
        if (i != system_properties_fd())
#endif
        (void) close(i);
#endif

    if (pipe(sigchld_pipe) < 0) {
        ABORT("Failed to setup sigchld pipe (%d)", errno);
    }

    SET_CLOEXEC(sigchld_pipe[0]);
    SET_CLOEXEC(sigchld_pipe[1]);

    max_fd = max_fd < sigchld_pipe[0] ? sigchld_pipe[0] : max_fd;

    sa.sa_handler = &handle_sigchld;
    sigemptyset(&sa.sa_mask);
    sa.sa_flags = SA_RESTART | SA_NOCLDSTOP;
    if (sigaction(SIGCHLD, &sa, 0) == -1) {
        perror(NULL);
        exit(1);
    }

    /* Ignore SIGTERM.
       Some container environments send SIGTERM to all processes
       when terminating. We don't want erl_child_setup to terminate
       in these cases as that will prevent beam from properly
       cleaning up.
    */
    sa.sa_handler = SIG_IGN;
    sigemptyset(&sa.sa_mask);
    sa.sa_flags = 0;

    if (sigaction(SIGTERM, &sa, 0) == -1) {
        perror(NULL);
        exit(1);
    }

    forker_hash_init();

    SET_CLOEXEC(uds_fd);

    DEBUG_PRINT("Starting forker %d", max_files);

    while (1) {
        fd_set read_fds;
        int res;
        FD_ZERO(&read_fds);
        FD_SET(uds_fd, &read_fds);
        FD_SET(sigchld_pipe[0], &read_fds);
        DEBUG_PRINT("child_setup selecting on %d, %d (%d)",
                uds_fd, sigchld_pipe[0], max_fd);
        res = select(max_fd+1, &read_fds, NULL, NULL, NULL);

        if (res < 0) {
            if (errno == EINTR) continue;
            ABORT("Select failed: %d (%d)",res, errno);
        }

        if (FD_ISSET(uds_fd, &read_fds)) {
            int pipes[3], res, os_pid;
            int fork_errno;
            ErtsSysForkerProto proto;
            errno = 0;
            if ((res = sys_uds_read(uds_fd, (char*)&proto, sizeof(proto),
                                    pipes, 3, MSG_DONTWAIT)) < 0) {
                if (errno == EINTR)
                    continue;
                DEBUG_PRINT("erl_child_setup failed to read from uds: %d, %d", res, errno);
                _exit(0);
            }

            if (res == 0) {
                DEBUG_PRINT("uds was closed!");
                _exit(0);
            }
            /* Since we use unix domain sockets and send the entire data in
               one go we *should* get the entire payload at once. */
            ASSERT(res == sizeof(proto));
            ASSERT(proto.action == ErtsSysForkerProtoAction_Start);

            /* Keep SIGCHLD away until the pid -> port id mapping has
               been stored and the pid has been reported */
            sys_sigblock(SIGCHLD);

            errno = 0;

            os_pid = fork();
            /* Remember the outcome of fork right away so that the calls
               below cannot overwrite it before it is reported */
            fork_errno = errno;
            if (os_pid == 0)
                start_new_child(pipes);

            add_os_pid_to_port_id_mapping(proto.u.start.port_id, os_pid);

            /* We write an ack here, but expect the reply on
               the pipes[0] inside the fork */
            proto.action = ErtsSysForkerProtoAction_Go;
            proto.u.go.os_pid = os_pid;
            proto.u.go.error_number = fork_errno;
            while (write(pipes[1], &proto, sizeof(proto)) < 0 && errno == EINTR)
                ; /* remove gcc warning */

#ifdef FORKER_PROTO_START_ACK
            proto.action = ErtsSysForkerProtoAction_StartAck;
            while (write(uds_fd, &proto, sizeof(proto)) < 0 && errno == EINTR)
                ; /* remove gcc warning */
#endif

            sys_sigrelease(SIGCHLD);
            close(pipes[0]);
            close(pipes[1]);
            close(pipes[2]);
        }

        if (FD_ISSET(sigchld_pipe[0], &read_fds)) {
            int ibuff[2]; /* [0] = pid, [1] = wait status, see handle_sigchld */
            ErtsSysForkerProto proto;
            res = read(sigchld_pipe[0], ibuff, sizeof(ibuff));
            if (res <= 0) {
                if (errno == EINTR)
                    continue;
                ABORT("Failed to read from sigchld pipe: %d (%d)", res, errno);
            }

            proto.u.sigchld.port_id = get_port_id((pid_t)(ibuff[0]));

            if (proto.u.sigchld.port_id == THE_NON_VALUE)
                continue; /* exit status report not requested */

            proto.action = ErtsSysForkerProtoAction_SigChld;
            proto.u.sigchld.error_number = ibuff[1];
            DEBUG_PRINT("send sigchld to %d (errno = %d)", uds_fd, ibuff[1]);
            if (write(uds_fd, &proto, sizeof(proto)) < 0) {
                if (errno == EINTR)
                    continue;
                /* The uds was closed, which most likely means that the VM
                   has exited. This will be detected when we try to read
                   from the uds_fd. */
                DEBUG_PRINT("Failed to write to uds: %d (%d)", uds_fd, errno);
            }
        }
    }
    return 1;
}

/*
 * One entry of forker_hash: associates the os pid of a started child with
 * the port id that asked for its exit status.
 */
typedef struct exit_status {
    HashBucket hb;   /* bucket header; presumably must stay the first member
                        for hash.h - TODO confirm */
    pid_t os_pid;    /* key: compared by fcmp() and hashed by fhash() */
    Eterm port_id;   /* value: returned by get_port_id() */
} ErtsSysExitStatus;

/* The os pid -> port id table; created by forker_hash_init(). */
static Hash *forker_hash;

/*
 * Remember which port id belongs to os_pid so that the exit of that child
 * can later be reported. Nothing is stored when port_id is THE_NON_VALUE,
 * i.e. when no exit status report was requested.
 */
static void add_os_pid_to_port_id_mapping(Eterm port_id, pid_t os_pid)
{
    ErtsSysExitStatus tmpl;

    if (port_id == THE_NON_VALUE)
        return; /* exit status report not requested */

    /* Stack template only; the table's alloc callback (falloc) is what
       creates the heap copy of these two fields. */
    tmpl.port_id = port_id;
    tmpl.os_pid = os_pid;
    hash_put(forker_hash, &tmpl);
}

/*
 * Look up and remove the entry stored for os_pid.
 * Returns the port id of that entry, or THE_NON_VALUE when no entry
 * exists. The removed entry is freed here.
 */
static Eterm get_port_id(pid_t os_pid)
{
    ErtsSysExitStatus key;
    ErtsSysExitStatus *found;
    Eterm result = THE_NON_VALUE;

    /* only os_pid takes part in hashing and comparison */
    key.os_pid = os_pid;
    found = hash_remove(forker_hash, &key);

    if (found) {
        result = found->port_id;
        free(found);
    }
    return result;
}

/*
 * Compare callback of forker_hash: returns 0 when both entries carry the
 * same os pid and 1 otherwise.
 */
static int fcmp(void *a, void *b)
{
    const ErtsSysExitStatus *lhs = a;
    const ErtsSysExitStatus *rhs = b;

    return (lhs->os_pid == rhs->os_pid) ? 0 : 1;
}

/*
 * Hash callback of forker_hash: scrambles the os pid of the entry through
 * a fixed sequence of add/xor/shift rounds on a 32-bit value.
 */
static HashValue fhash(void *e)
{
    ErtsSysExitStatus *entry = e;
    Uint32 h = entry->os_pid;

    /* the order of the rounds matters; keep them exactly as they are */
    h = (h + 0x7ed55d16) + (h << 12);
    h = (h ^ 0xc761c23c) ^ (h >> 19);
    h = (h + 0x165667b1) + (h << 5);
    h = (h + 0xd3a2646c) ^ (h << 9);
    h = (h + 0xfd7046c5) + (h << 3);
    h = (h ^ 0xb55a4f09) ^ (h >> 16);

    return h;
}

/*
 * Alloc callback of forker_hash: creates a heap copy of the os_pid and
 * port_id fields of the template entry e and returns it.
 *
 * Running out of memory is fatal: the process is aborted with a
 * diagnostic instead of writing through a NULL pointer.
 */
static void *falloc(void *e)
{
    ErtsSysExitStatus *se = e;
    ErtsSysExitStatus *ne = malloc(sizeof(*ne));

    if (ne == NULL) {
        ABORT("Failed to allocate %lu bytes for an exit status entry",
              (unsigned long) sizeof(*ne));
    }

    ne->os_pid = se->os_pid;
    ne->port_id = se->port_id;
    return ne;
}

/* Meta-data allocation callback of forker_hash: plain malloc, the type
   tag is not used. */
static void *meta_alloc(int type, size_t size)
{
    (void) type;
    return malloc(size);
}

/* Meta-data release callback of forker_hash: plain free, the type tag is
   not used. */
static void meta_free(int type, void *p)
{
    (void) type;
    free(p);
}

/*
 * Create forker_hash, the os pid -> port id table, with the callbacks
 * defined in this file. Always returns 1.
 */
static int forker_hash_init(void)
{
    HashFunctions funs;

    /* callbacks working on single entries */
    funs.hash  = fhash;
    funs.cmp   = fcmp;
    funs.alloc = falloc;
    funs.free  = free;

    /* callbacks for the table's own bookkeeping memory */
    funs.meta_alloc = meta_alloc;
    funs.meta_free  = meta_free;
    funs.meta_print = NULL;

    forker_hash = hash_new(0, "forker_hash", 16, funs);

    return 1;
}