Unfortunately the code I'm attempting to debug is extremely complicated but I've tried to simplify the issue here.
Parent process forks two children, one which is a PULL ZMQ and the other which is reading messages from a pipe (using select and a timeout) from the parent and PUSHing messages to the ZMQ child. All processes have a $SIG{DIE} handler that does this:
$SIG{__DIE__} = sub {
return if (defined($^S) && $^S); # in eval
return if !defined($^S); # parsing
if ($lh) { # this is an initialised Log::Log4perl handle logging
+to a file
local $Log::Log4perl::caller_depth = $Log::Log4perl::caller_de
+pth + 1;
$lh->fatal($_[0]);
}
};
If the child PUSHing messages to the ZMQ child calls die the handler is invoked and writes the fatal message to the log file but after that it hangs on restart_syscall (according to strace) and refuses to anything from that point on. The slightly strange thing is if you strace the dying child before it dies the strace does:
select(16, [8], NULL, NULL, {tv_sec=10, tv_usec=0}) = 0 (Timeout)
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=3687, ...}) = 0
getpid() = 11036
gettimeofday({tv_sec=1531217468, tv_usec=529330}, NULL) = 0
stat("/etc/localtime", {st_mode=S_IFREG|0644, st_size=3687, ...}) = 0
write(3, "11:11:08.529 FATAL 11036 L=564"..., 85) = 85
write(2, "FRED at scout.pl line 5646.\n", 28) = 28
HERE WE DIE
rt_sigaction(SIGHUP, NULL, {sa_handler=0x557e00140f70, sa_mask=[], sa_
+flags=SA_RESTORER, sa_restorer=0x7f9b8e5740c0}, 8) = 0
rt_sigaction(SIGHUP, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=SA_REST
+ORER, sa_restorer=0x7f9b8e5740c0}, {sa_handler=0x557e00140f70, sa_mas
+k=[], sa_flags=SA_RESTORER, sa_restorer=0x7f9b8e5740c0}, 8) = 0
rt_sigaction(SIGINT, NULL, {sa_handler=0x557e00140f70, sa_mask=[], sa_
+flags=SA_RESTORER, sa_restorer=0x7f9b8e5740c0}, 8) = 0
rt_sigaction(SIGINT, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=SA_REST
+ORER, sa_restorer=0x7f9b8e5740c0}, {sa_handler=0x557e00140f70, sa_mas
+k=[], sa_flags=SA_RESTORER, sa_restorer=0x7f9b8e5740c0}, 8) = 0
rt_sigaction(SIGQUIT, NULL, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=
+0}, 8) = 0
rt_sigaction(SIGILL, NULL, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=0
+}, 8) = 0
rt_sigaction(SIGTRAP, NULL, {sa_handler=SIG_DFL, sa_mask=[], sa_flags=
+0}, 8) = 0
loads more of these then:
write(14, "\1\0\0\0\0\0\0\0", 8) = 8
getpid() = 11036
poll([{fd=9, events=POLLIN}], 1, -1
<code>
and it hangs here. However, if you quit strace and restart it, it then
+ says:<p>
<code>
strace: Process 4166 attached
restart_syscall(<... resuming interrupted poll ...>
This only happens when using ZMQ::FFI. If the code is run without the option to pass messages via ZMQ the process dies.
Any ideas?
EDIT: The following code exhibits the problem if you run it enough - I had to run it 4 or 5 times before it happened:
use 5.016;
use strict;
use warnings;
use ZMQ::FFI;
use ZMQ::FFI::Constants qw(ZMQ_PUSH ZMQ_PULL);
use POSIX qw(:sys_wait_h _POSIX_PIPE_BUF setsid _exit);
use IO::Select;
my ($zmq_admin_ctx, $zmq_admin_socket);
my ($terminate, $sigchild);
$SIG{__DIE__} = sub {
return if (defined($^S) && $^S); # in eval
return if !defined($^S); # parsing
say "DIE ($_[0])";
return;
};
$SIG{TERM} = sub {
local $!; # don't overwrite $!
say "$$ SIGTERM caught - terminating";
$terminate = 1;
};
my $pid = fork;
if (!defined $pid) {
die "Failed to fork - $!";
} elsif ($pid) { # parent
say "Started pull child $pid";
my ($reader, $writer);
if (!pipe($reader, $writer)) {
die "Failed pipe - $!";
}
$reader->autoflush(1);
$writer->autoflush(1);
my $pid2 = fork;
if (!defined($pid2)) {
die "Failed to fork - $!";
} elsif ($pid2) {
say "Started push child $pid2";
close $reader;
sub got_sigchild {
local $!; # don't overwrite $!
$sigchild++;
say "SIGCHLD Child has died";
};
$SIG{CHLD} = \&got_sigchild;
while (1) {
if ($terminate) {
close_admin_zmq();
say "Waiting on $pid";
my $s = waitpid($pid, 0);
say "waitpid returned $s";
exit(0);
} elsif ($sigchild) {
say "Waiting on $pid2";
my $s = waitpid($pid2, 0);
say "waitpid returned $s";
say "waiting 5s and killing pull child";
sleep 5;
kill 'TERM', $pid;
exit(0);
}
}
} else {
close $writer;
zmq_child_push($reader);
}
} else {
zmq_child_pull();
}
sub zmq_child_push {
my ($reader) = @_;
my $select = IO::Select->new();
$select->add(fileno $reader);
sleep 2;
($zmq_admin_ctx, $zmq_admin_socket) = open_admin_zmq();
while (1) {
my @ready = $select->can_read(2);
my $buf;
if (scalar(@ready)) {
my $read = sysread($reader, $buf, 64*1024, length($buf));
say "Read $read";
} else { # timeout
eval {
say "Sending";
$zmq_admin_socket->send("hello");
};
say "send failed - $@" if $@;
sleep 10;
say "push child dying";
die "FRED";
}
}
}
sub zmq_child_pull {
my $context = ZMQ::FFI->new();
my $receiver = $context->socket(ZMQ_PULL);
$receiver->bind('tcp://*:9990');
my $string;
while (1) {
if ($terminate) {
say "child got terminate";
last;
}
eval {
$string = $receiver->recv();
};
if (my $ev = $@) {
say "Failed to recv on zmq socket - $@";
next;
}
say "0MQ Received: $string";
}
$receiver->close;
$context->destroy;
say "child exiting";
exit(1);
}
sub open_admin_zmq {
say "open_admin_zmq";
my $ctx = ZMQ::FFI->new();
# Socket to send messages on
my $sender = $ctx->socket(ZMQ_PUSH);
$sender->connect('tcp://localhost:9990');
return ($ctx, $sender);
}
sub close_admin_zmq {
if ($zmq_admin_socket) {
say "close_admin_zmq";
$zmq_admin_socket->disconnect('tcp://localhost:9990');
$zmq_admin_socket->close;
$zmq_admin_ctx->destroy;
}
return;
}