What happens is when a kill 9 is issued against the hung ssh/scp process, it does not always terminate the ssh/scp process. The actual code is rather length. So, I am going to try to boil it down. Please note that the second "some-perl14-script" is a result of the "defunct" and if I kill that one, the defunct processes clear. This was not an issue under perl 5.12, and only started under 5.14. So I will assume that I am not doing something correct with the kill.
PROCESS TABLE EXAMPLE:
PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND
13124 uasbatch 16 0 1054m 347m 2568 S 1 11.4 6:50.22 some-perl
+14-script.pl
3385 someuser 16 0 0 0 0 Z 0 0.0 0:00.00 ssh <defu
+nct>
3386 someuser 16 0 0 0 0 Z 0 0.0 0:00.01 ssh <defu
+nct>
3387 someuser 17 0 296m 186m 352 S 0 6.2 0:00.00 some-perl
+14-script.pl
For this I am going to leverage the code that you showed me earlier in the year. I think it may be easier, and I believe the problem would still be there.
#!/opt/PerlDirect/1204/x86_64/bin/perl -slw
use strict;
use threads;
use threads::shared;
use Thread::Queue;
use IPC::Open3;
use FileHandle;
use POSIX qw(:errno_h :sys_wait_h);
use constant {
RANDOM => 15,
THREADS => 3,
JOBS => 20,
TIMEOUT => 2,
PW_POLL => 1,
};
my $Q = new Thread::Queue;
our %PROC_WATCH_CMD :shared; # watch external procs.
my $semSTD :shared;
sub tprint {
my $tid = threads->tid;
lock $semSTD;
print "[$tid] ", @_;
}
my $die_early :shared = 0;
$SIG{ INT } = $SIG{TERM} = $SIG{KILL} = sub {
tprint q{Early termination requested};
$Q->dequeue($Q->pending()) if ( $Q->pending() > 0 );
$Q->enqueue( (undef) x THREADS );
$die_early = 1;
};
my $semPROCKILL :shared = 0;
sub kill_pid {
lock $semPROCKILL;
return (kill('SIGKILL' => $_[0])) ? 1 : 0;
}
my $semPROCCHECK :shared = 0;
sub is_pid_alive {
lock $semPROCCHECK;
return (kill('SIGCHLD' => $_[0]) and ! $!{EPERM}) ? 1 : 0;
}
sub add_to_process_watch {
lock %PROC_WATCH_CMD;
## just assume that I am adding other items of interest
## to this shared clone, such as the cmd being run and
## how long I am allowing it to run. these items are
## omitted for now.
$PROC_WATCH_CMD{$_[0]} = shared_clone( { q{time} => time,
q{thr} => threads->ti
+d() } );
}
sub remove_from_process_watch {
lock %PROC_WATCH_CMD;
return if ( ! defined $_[0] or $_[0] eq q{} );
delete $PROC_WATCH_CMD{$_[0]} if ( defined $PROC_WATCH_CMD{$_[0]} );
}
my $semPW :shared = 0;
sub process_watcher {
lock $semPW;
while ( ! $die_early ) {
tprint q{process_watcher is running};
sleep( PW_POLL );
{
lock %PROC_WATCH_CMD;
tprint q{process_watcher is running};
foreach ( keys %PROC_WATCH_CMD ) {
unless ( is_pid_alive( $_ ) ) {
remove_from_process_watch( $_ ) if ( defined $PROC_WATCH_CMD
+{$_} );
next;
}
next unless( ( time - $PROC_WATCH_CMD{$_}{'time'} ) > TIMEOUT
+);
tprint q{PROCESS_TIMEOUT: } . $_;
kill_pid($_) if ( is_pid_alive($_) );
}
}
}
tprint q{process_watcher is finished};
return 1;
}
sub worker {
tprint q{worker started};
my( $Q ) = @_;
my $h = { q{stdin} => FileHandle->new,
q{stdout} => FileHandle->new,
q{stderr} => FileHandle->new };
while( !$die_early and defined( my $job = $Q->dequeue ) ) {
## we will use sleep here, in the real world this is an
## ssh/scp command.
tprint q{processing job: } . $job;
my $pid = open3( $h->{'stdin'}, $h->{'stdout'}, $h->{'stderr'}, q{
+sleep } . int(rand RANDOM) ) or die $!;
add_to_process_watch( $pid );
tprint q{waiting for pid: } . $pid;
waitpid $pid, 0;
tprint q{pid: } . $pid . q{ done};
remove_from_process_watch( $pid );
}
tprint q{Worker ending};
return 1;
}
## MAIN
$Q->enqueue( map "JOB-$_", 1 .. JOBS );
$Q->enqueue( (undef) x THREADS );
tprint q{Queue populated};
my $proc_watch = threads->new( \&process_watcher );
my @threads = map threads->new( \&worker, $Q ), 1 .. THREADS;
tprint q{Workers started; waiting...};
$_->join for @threads;
print q{Program complete};
--thanks!
|