http://qs321.pair.com?node_id=832351


in reply to Re: No child processes - system limit?
in thread No child processes - system limit?

At the suggestion of moritz, I ran the script with strace, the relevant bits of which are as follows:
select(8, [3], NULL, NULL, {180, 0}) = 1 (in [3], left {180, 0} +) read(3, "e_type\":\"all_birthday\",\"ancestor"..., 4096) = 4096 mremap(0xb7a16000, 3756032, 3760128, MREMAP_MAYMOVE) = 0xb7a16000 time(NULL) = 1270142654 select(8, [3], NULL, NULL, {180, 0}) = 1 (in [3], left {180, 0} +) read(3, "us\":\"active\",\"last_modified\":\"20"..., 4096) = 4096 mremap(0xb7a16000, 3760128, 3764224, MREMAP_MAYMOVE) = 0xb7a16000 time(NULL) = 1270142654 select(8, [3], NULL, NULL, {180, 0}) = 1 (in [3], left {180, 0} +) read(3, ",\"text\":\"RUSSELL Nee Viney Nigel"..., 4096) = 4096 mremap(0xb7a16000, 3764224, 3768320, MREMAP_MAYMOVE) = 0xb7a16000 time(NULL) = 1270142654 select(8, [3], NULL, NULL, {180, 0}) = 1 (in [3], left {180, 0} +) read(3, "8\",\"featured\":0,\"sub_type\":\"memo"..., 836) = 836 time(NULL) = 1270142654 time(NULL) = 1270142654 mmap2(NULL, 3768320, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMO +US, -1, 0) = 0xb767e000 munmap(0xb6a62000, 4059136) = 0 mmap2(NULL, 3768320, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMO +US, -1, 0) = 0xb6e88000 munmap(0xb5d2a000, 4059136) = 0 mmap2(NULL, 3768320, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMO +US, -1, 0) = 0xb6af0000 munmap(0xb767e000, 3768320) = 0 munmap(0xb7a16000, 3768320) = 0 write(1, "\n", 1) = 1 write(1, " - objects 225001 .. 230000", 27) = 27 clone(child_stack=0, flags=CLONE_CHILD_CLEARTID|CLONE_CHILD_SETTID +|SIGCHLD, child_tidptr=0xb7f7db78) = 14233 time(NULL) = 1270142654 rt_sigprocmask(SIG_BLOCK, [CHLD], [], 8) = 0 rt_sigaction(SIGCHLD, NULL, {0xe5c500, [], 0}, 8) = 0 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 nanosleep({1, 0}, 0xbfb45ea4) = ? ERESTART_RESTARTBLOCK +(To be restarted) --- SIGCHLD (Child exited) @ 0 (0) --- sigreturn() = ? 
(mask now []) time(NULL) = 1270142654 rt_sigprocmask(SIG_BLOCK, [CHLD], NULL, 8) = 0 waitpid(14232, 0xbfb45be8, WNOHANG) = 0 waitpid(14224, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG) = + 14224 waitpid(14233, 0xbfb45be8, WNOHANG) = 0 waitpid(14225, 0xbfb45be8, WNOHANG) = 0 waitpid(14228, 0xbfb45be8, WNOHANG) = 0 waitpid(14229, 0xbfb45be8, WNOHANG) = 0 waitpid(14226, 0xbfb45be8, WNOHANG) = 0 waitpid(14230, 0xbfb45be8, WNOHANG) = 0 waitpid(14231, 0xbfb45be8, WNOHANG) = 0 waitpid(14227, 0xbfb45be8, WNOHANG) = 0 rt_sigprocmask(SIG_BLOCK, [CHLD], [CHLD], 8) = 0 rt_sigaction(SIGCHLD, {0xe5c500, [], 0}, {0xe5c500, [], 0}, 8) = 0 rt_sigprocmask(SIG_SETMASK, [CHLD], NULL, 8) = 0 rt_sigprocmask(SIG_UNBLOCK, [CHLD], NULL, 8) = 0 write(1, "\n", 1) = 1

Here is where the parent process makes the request:

select(8, [3], NULL, NULL, {0, 0}) = 0 (Timeout) time(NULL) = 1270142654 select(8, [3], [3], NULL, {180, 0}) = 1 (out [3], left {180, 0 +}) write(3, "GET /ia_object/_search?searchTyp"..., 246) = 246 time(NULL) = 1270142654 select(8, [3], NULL, NULL, {180, 0}) = ? ERESTARTNOHAND (To be +restarted) --- SIGCHLD (Child exited) @ 0 (0) --- sigreturn() = ? (mask now []) rt_sigprocmask(SIG_BLOCK, [CHLD], NULL, 8) = 0 waitpid(14232, 0xbfb45be8, WNOHANG) = 0 waitpid(14233, 0xbfb45be8, WNOHANG) = 0 waitpid(14225, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG) = + 14225 waitpid(14228, 0xbfb45be8, WNOHANG) = 0 waitpid(14229, 0xbfb45be8, WNOHANG) = 0 waitpid(14226, 0xbfb45be8, WNOHANG) = 0 waitpid(14230, 0xbfb45be8, WNOHANG) = 0 waitpid(14231, 0xbfb45be8, WNOHANG) = 0 waitpid(14227, 0xbfb45be8, WNOHANG) = 0 rt_sigprocmask(SIG_BLOCK, [CHLD], [CHLD], 8) = 0 rt_sigaction(SIGCHLD, {0xe5c500, [], 0}, {0xe5c500, [], 0}, 8) = 0 rt_sigprocmask(SIG_SETMASK, [CHLD], NULL, 8) = 0 rt_sigprocmask(SIG_UNBLOCK, [CHLD], NULL, 8) = 0 time(NULL) = 1270142662 time(NULL) = 1270142662 select(8, [3], NULL, NULL, {172, 0}) = ? ERESTARTNOHAND (To be +restarted) --- SIGCHLD (Child exited) @ 0 (0) --- sigreturn() = ? 
(mask now []) rt_sigprocmask(SIG_BLOCK, [CHLD], NULL, 8) = 0 waitpid(14232, 0xbfb45be8, WNOHANG) = 0 waitpid(14233, 0xbfb45be8, WNOHANG) = 0 waitpid(14225, 0xbfb45be8, WNOHANG) = -1 ECHILD (No child proc +esses) waitpid(14228, 0xbfb45be8, WNOHANG) = 0 waitpid(14229, 0xbfb45be8, WNOHANG) = 0 waitpid(14226, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG) = + 14226 waitpid(14230, 0xbfb45be8, WNOHANG) = 0 waitpid(14231, 0xbfb45be8, WNOHANG) = 0 waitpid(14227, 0xbfb45be8, WNOHANG) = 0 rt_sigprocmask(SIG_BLOCK, [CHLD], [CHLD], 8) = 0 rt_sigaction(SIGCHLD, {0xe5c500, [], 0}, {0xe5c500, [], 0}, 8) = 0 rt_sigprocmask(SIG_SETMASK, [CHLD], NULL, 8) = 0 rt_sigprocmask(SIG_UNBLOCK, [CHLD], NULL, 8) = 0 close(3) = 0 time(NULL) = 1270142665 time(NULL) = 1270142665 stat64("/opt/apache/sites/IA/perl/HTTP/Headers/Util.pmc", 0xbfb45d +9c) = -1 ENOENT (No such file or directory) stat64("/opt/apache/sites/IA/perl/HTTP/Headers/Util.pm", 0xbfb45cb +0) = -1 ENOENT (No such file or directory) stat64("/opt/apache/sites/Burro/HTTP/Headers/Util.pmc", 0xbfb45d9c +) = -1 ENOENT (No such file or directory) stat64("/opt/apache/sites/Burro/HTTP/Headers/Util.pm", 0xbfb45cb0) + = -1 ENOENT (No such file or directory) stat64("/opt/perl-5.8.9/lib/5.8.9/i686-linux/HTTP/Headers/Util.pmc +", 0xbfb45d9c) = -1 ENOENT (No such file or directory) stat64("/opt/perl-5.8.9/lib/5.8.9/i686-linux/HTTP/Headers/Util.pm" +, 0xbfb45cb0) = -1 ENOENT (No such file or directory) stat64("/opt/perl-5.8.9/lib/5.8.9/HTTP/Headers/Util.pmc", 0xbfb45d +9c) = -1 ENOENT (No such file or directory) stat64("/opt/perl-5.8.9/lib/5.8.9/HTTP/Headers/Util.pm", 0xbfb45cb +0) = -1 ENOENT (No such file or directory) stat64("/opt/perl-5.8.9/lib/site_perl/5.8.9/i686-linux/HTTP/Header +s/Util.pmc", 0xbfb45d9c) = -1 ENOENT (No such file or directory) stat64("/opt/perl-5.8.9/lib/site_perl/5.8.9/i686-linux/HTTP/Header +s/Util.pm", 0xbfb45cb0) = -1 ENOENT (No such file or directory) 
stat64("/opt/perl-5.8.9/lib/site_perl/5.8.9/HTTP/Headers/Util.pmc" +, 0xbfb45d9c) = -1 ENOENT (No such file or directory) stat64("/opt/perl-5.8.9/lib/site_perl/5.8.9/HTTP/Headers/Util.pm", + {st_mode=S_IFREG|0444, st_size=4887, ...}) = 0 open("/opt/perl-5.8.9/lib/site_perl/5.8.9/HTTP/Headers/Util.pm", O +_RDONLY|O_LARGEFILE) = 3 ioctl(3, SNDCTL_TMR_TIMEBASE or TCGETS, 0xbfb45ad8) = -1 ENOTTY (I +nappropriate ioctl for device) _llseek(3, 0, [0], SEEK_CUR) = 0 read(3, "package HTTP::Headers::Util;\n\nus"..., 4096) = 4096 _llseek(3, 1712, [1712], SEEK_SET) = 0 _llseek(3, 0, [1712], SEEK_CUR) = 0 close(3) = 0 munmap(0xb6af0000, 3768320) = 0 munmap(0xb6e88000, 3768320) = 0

At this stage, my code catches the "select failed: No child processes" error in an eval, issues a warning, then sleeps before retrying:

write(2, "\nSystem is busy - trying again\n", 31) = 31 time(NULL) = 1270142665 rt_sigprocmask(SIG_BLOCK, [CHLD], [], 8) = 0 rt_sigaction(SIGCHLD, NULL, {0xe5c500, [], 0}, 8) = 0 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 nanosleep({3, 0}, 0xbfb45ea4) = ? ERESTART_RESTARTBLOCK (To be restarted)

I'm not sure what most of this means, but is the value of $! being set to "no child processes" by one of my waitpid calls, which is interfering with the code in LWP::Protocol::http? Would it help if I localised $! in my reaper sub?

Replies are listed 'Best First'.
Re^3: No child processes - system limit?
by ikegami (Patriarch) on Apr 01, 2010 at 18:09 UTC

    Would it help if I localised $! in my reaper sub?

    I believe so. That's exactly where I was going with my question.

Re^3: No child processes - system limit?
by almut (Canon) on Apr 01, 2010 at 19:15 UTC
    select(8, [3], NULL, NULL, {172, 0}) = ? ERESTARTNOHAND (To be restarted) --- SIGCHLD (Child exited) @ 0 (0) --- sigreturn() = ? (mask now []) rt_sigprocmask(SIG_BLOCK, [CHLD], NULL, 8) = 0 waitpid(14232, 0xbfb45be8, WNOHANG) = 0 waitpid(14233, 0xbfb45be8, WNOHANG) = 0 waitpid(14225, 0xbfb45be8, WNOHANG) = -1 ECHILD (No child processes) ...

    My interpretation of this would be (as you already figured) that $! is being modified in the signal handler before the interrupted select call gets a chance to be restarted, i.e. the redo SELECT doesn't execute because of that very modification of $!.

    (Note that because of Perl's deferred (aka safe) signal handling, the sigreturn() (which is being called at the end of the "real" system/C-level signal handler) happens immediately, before the Perl signal handler runs all the waitpid calls. Still, they do run before the next Perl opcode executes (which means this is presumably before if ($!{EINTR} || $!{EAGAIN}) ).)

    What I find a little surprising is that the ECHILD does occur at all, because your $Children{$pid} should've been set to zero in the previous call to the signal handler

    waitpid(14225, [{WIFEXITED(s) && WEXITSTATUS(s) == 0}], WNOHANG) = 14225

    where the waitpid did return 14225 (i.e. $res > 0). In other words, you shouldn't be calling waitpid(14225,...) again thereafter, because the 14225 is no longer supposed to be in the hash...  (update: err wait, this is nonsense of course, as you're iterating over the keys, not the values.  OTOH, this brings up the question of what would happen if you did set the values to the PIDs, too, and then iterate over the values instead (as you seem to be getting that panic when deleting the keys...))

    Maybe you could try to figure out why this is — in addition to trying to localize $! as a workaround, of course.

      local'ising $! seems to have sorted out that issue, revealing the real error that is happening on the remote process.

      Re your other point, yes - deleting keys in the hash causes a panic, but I'll change the loop to only call waitpid on those keys that have true values, which should help.

      thanks