Avatar of sevrin
sevrin

asked on

Initializing daemon processes

My question is based on an exercise from W. Richard Stevens' "Unix Network Programming". In this book, Stevens presents code for a Unix domain stream server and client; see below. (The client and server perform a simple echo function.) However, Stevens points out that the server doesn't do many of the things that a server should do, such as close all open file descriptors. To rectify this, he presents a function called daemon_start, also below. I've tried calling this function at various points within the server code, but while it does seem to do part of what one would expect (it appears to detach the server from the terminal, for example), the system soon becomes clogged with zombies, to the point that everything grinds to a halt. It also prevents the echoing, mentioned above, from taking place. So: what change do I have to make to the server and/or client in order to accommodate the daemon_start function?

Thanks,

David King.

/*
 * Example of server using UNIX domain stream protocol
 */

#include "uniks.h"
int readline(int, char *, int);
int writen(int, char *, int);

main(int argc, char *argv[])
{
      int                  sockfd, newsockfd, clilen, childpid, servlen;
      struct sockaddr_un      cli_addr, serv_addr;

      pname = argv[0];

      /*
       * Open a socket (a UNIX domain stream socket).
       */

      if ((sockfd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
            printf("server: can't open stream socket\n");

      /*
       * Bind our local address so that the client can send to us.
       */

      bzero((char *) &serv_addr, sizeof(serv_addr));
      serv_addr.sun_family = AF_UNIX;
      strcpy(serv_addr.sun_path, UNIXSTR_PATH);
      servlen = sizeof(serv_addr.sun_family) + strlen(serv_addr.sun_path);

      if (bind(sockfd, (struct sockaddr *) &serv_addr, servlen) < 0)
            printf("server: can't bind local address\n");

      listen(sockfd, 5);

      for ( ; ; ) {
            clilen = sizeof(cli_addr);
            if ((newsockfd = accept(sockfd, (struct sockaddr *) &cli_addr, &clilen)) < 0)
                  printf("server: accept error\n");

            if (childpid = fork() < 0)
                  printf("server: fork error\n");
            else if (childpid == 0) {      /* child process */
                  close(sockfd);
                  str_echo(newsockfd);
                  exit(0);
            }
            close(newsockfd);
      }
}
      
/*
 * Read a stream socket one line at a time, and write each line back
 * to the sender. Return when the connection is terminated.
 */

#define MAXLINE 512

str_echo(int sockfd)
{
      int n;
      char line[MAXLINE];

      for ( ; ; ) {
            n = readline(sockfd, line, MAXLINE);
            if (n == 0)
                  return;            /* connection terminated */
            else if (n < 0)
                  printf("str_echo: readline error\n");

            if (writen(sockfd, line, n) != n)
                  printf("str_echo: writen error\n");
      }
}

/*
 * Read a line from a descriptor. Read the line one byte at a time,
 * looking for the newline. We store the newline in the buffer, then
 * follow it with a null. We return the number of characters up to but
 * not including the null.
 */

int readline(fd, ptr, maxlen)
register int      fd;
register char     *ptr;
register int      maxlen;
{
      int n, rc;
      char c;

      for (n = 1; n < maxlen; n++) {
            if ((rc = read(fd, &c, 1)) == 1) {
                  *ptr++ = c;
                  if (c == '\n')
                        break;
            }
            else if (rc == 0) {
                  if (n == 1)
                        return(0);      /* EOF, no data read */
                  else
                        break;
            }
            else
                  return(-1);            /* error */
      }
      *ptr = 0;
      return(n);
}

                                    
/* write n bytes to a descriptor. Use in place of write()
 * when fd is a stream socket.
 */

int writen(register int fd, register char *ptr, register int nbytes)
{
      int nleft, nwritten;

      nleft = nbytes;
      while (nleft > 0) {
            nwritten = write(fd, ptr, nleft);
            if (nwritten <= 0)
                  return(nwritten);            /* error */

            nleft -= nwritten;
            ptr += nwritten;
      }
      return(nbytes - nleft);
}

/*
 * Example of client using UNIX domain stream protocol
 */

#include <stdio.h>
#include "uniks.h"
int writen(register int, register char *, register int);
int readline(int, char *, int);

main(int argc, char *argv[])
{
      int                  sockfd, servlen;
      struct sockaddr_un      serv_addr;
      FILE      *fp;
      

      pname = argv[0];

      /*
       * Fill in the structure "serv_addr" with the address of the
       * server that we want to send to.
       */

      bzero((char *) &serv_addr, sizeof(serv_addr));
      serv_addr.sun_family = AF_UNIX;
      strcpy(serv_addr.sun_path, UNIXSTR_PATH);
      servlen = strlen(serv_addr.sun_path) + sizeof(serv_addr.sun_family);

      /* Open a UNIX domain stream socket */

      if ((sockfd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
            printf("client: can't open stream socket\n");

      /*
       * Connect to the server
       */

      if (connect(sockfd, (struct sockaddr *) &serv_addr, servlen) < 0)
            printf("client: can't connect to server\n");
      if (!(fp = fopen("filetoread.txt", "r")))
            {
            printf("Can't open file\n");
            exit(1);
            }
      str_cli(stdin, sockfd);            /* do it all */

      close(sockfd);
      exit(0);
}

#include <signal.h>
#include <sys/param.h>

#ifdef SIGTSTP
#include <sys/file.h>
#include <sys/ioctl.h>
#endif

/*
 * Detach a daemon process from login session context.
 */

daemon_start(int ignsigcld)      /* nonzero -> handle SIGCLDs so zombies don't clog */
{

      register int childpid, fd;

/*
 * If we were started by init (process 1) from the /etc/inittab file
 * there's no need to detach. This test is unreliable owing to an
 * unavoidable ambiguity if the process is started by some other
 * process and orphaned (i.e., if the parent process terminates before
 * we are started).
 */

      if (getppid() == 1)
            goto out;

      /* Ignore the terminal stop signals (BSD) */

#ifdef SIGTTOU
      signal(SIGTTOU, SIG_IGN);
#endif
#ifdef SIGTTIN
      signal(SIGTTIN, SIG_IGN);
#endif
#ifdef SIGTSTP
      signal(SIGTSTP, SIG_IGN);
#endif

      /* If we were not started in the background, fork and
       * let the parent exit. This also guarantees the first
       * child is not a process group leader.
       */

      if ((childpid = fork()) < 0)
            printf("Can't fork first child\n");
      else if (childpid > 0)
            exit(0);       /* parent */

      /*
       * First child process. Dissociate from controlling terminal
       * and process group. Ensure the process can't reacquire a new
       * controlling terminal.
          */

#ifdef SIGTSTP            /* BSD */

      if(setpgrp(0, getpid()) == -1)       /* process will be a leader in its own group */
            printf("Can't change process group\n");

      if ((fd = open("/dev/tty", O_RDWR)) >= 0)
            {
            ioctl(fd, TIOCNOTTY, (char *)NULL);      /* lose controlling tty */
            close(fd);
            }
#else                  /* System V */

      if(setpgrp() == -1)
            printf("Can't change process group\n");

      signal(SIGHUP, SIG_IGN);      /* immune from pgrp leader death */

      if ((childpid = fork()) < 0)
            printf("Can't fork second child\n");
      else if (childpid > 0)
            exit(0);            /* first child */

      /* second child */
#endif

out:

      /*
       * Close any open file descriptors.
       */

      for (fd = 0; fd < NOFILE; fd++)
            close(fd);

      /* Move the current directory to root, to make sure we
       * aren't on a mounted filesystem.
       */

      chdir("/");

      /*
       * Clear any inherited file mode creation mask.
       */

      umask(0);

      /*
       * See if the caller isn't interested in the exit status of its
       * children, and doesn't want to have them become zombies and
       * clog up the system. With System V all we need do is ignore
       * the signal. With BSD, however, we have to catch each signal
       * and execute the wait3() system call.
        */

      if (ignsigcld)
            {
#ifdef SIGTSTP
            int sig_child();
            signal(SIGCLD, sig_child);      /* BSD */
#else
            signal(SIGCLD, SIG_IGN);      /* System V */
#endif
            }
}

/* This is a 4.3BSD SIGCLD signal handler that can be used by a server
 * that's not interested in its children's exit status, but needs to
 * wait for them, to avoid clogging up the system with zombies.
 * Beware that the calling process may get an interrupted system call
 * when we return, so they had better handle that.
 */

#include "systype.h"

sig_child()
{
#ifdef BSD
            /*
             * Use the wait3() system call with the WNOHANG option.
              */

      int pid;
      union wait status;

      while ((pid = wait3(&status, WNOHANG, (struct rusage *) 0)) > 0)
            ;
#endif
}

            

Avatar of ecw
ecw

1) Change
  if (childpid = fork() < 0)
to
  if ((childpid = fork()) < 0)

2) Near the start of main, add:
  signal(SIGCHLD, SIG_IGN);
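
For example, a minimal sketch of where that call might go (this assumes <signal.h> is included; everything else in main() stays exactly as it is):

main(int argc, char *argv[])
{
      int sockfd, newsockfd, clilen, childpid, servlen;
      struct sockaddr_un cli_addr, serv_addr;

      pname = argv[0];
      signal(SIGCHLD, SIG_IGN);      /* SYSV-style: exited children are reaped automatically */

      /* ... socket(), bind(), listen() and the accept() loop as before ... */
}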

Avatar of sevrin

ASKER

Whoops... the childpid=fork line was a typo. The problem isn't a coding error: Stevens, in his exercise, says: "The example server does not call the daemon_start function. Modify the server
to call this function. What other changes do you have to make to the server?" Are you saying that the only change that has to be made is for SIGCHLD to be ignored? But if that's so, why did Stevens build so much flexibility into the function's signal-handling ability?
Ignoring SIGCHLD is a SYSV-ism; use it if you can.  Now, call
  daemon_start(1);
between
  listen(sockfd, 5);
and
  for ( ; ;) {

Sorry, I meant to say: on BSD systems, ensure SIGCHLD (SIGCLD) is handled by sig_child; on SYSV this isn't needed, just ignore the signal.  Note that the definition of SIGTSTP is not enough to determine whether you're on BSD (it's present on modern SYSV systems too), and on SYSV, signal(2) is unreliable and must be re-initialised each time the handler is invoked, so change
     #ifdef SIGTSTP
     int sig_child();
     signal(SIGCLD, sig_child); /* BSD */
     #else
     signal(SIGCLD, SIG_IGN); /* System V */
     #endif
to something like
    #if defined(BSD) || defined(_BSD)
     int sig_child();
     signal(SIGCLD, sig_child); /* BSD */
     #else
     signal(SIGCLD, SIG_IGN); /* System V */
     #endif

Avatar of sevrin

ASKER

This makes no difference to the operation of the program - it still clogs with zombies. Stevens' question obviously invites some more fundamental modification. I wish I knew what....
OK, if a ps listing shows the expired children as zombie rather than defunct, I'll assume that you're using a BSD, in which case the signal handler must be called on child death.  Check that the compiled code calls "signal(SIGCLD, sig_child);", and check that sig_child is called on the death of the first and second child.  If it isn't called the second time round, you're using a mixed SYSV/BSD system, so re-initialise the signal handler when you enter sig_child (call "signal(SIGCLD, sig_child);" again).

Then again, you may need to link your code against a BSD compatibility library; check whether one is included on your system.

I haven't used BSD for over 8 years, so excuse me if I'm wrong.
If child processes don't go away, the parent process needs to 'wait' for them. Check that the wait call is executed.

If it is executed, check the options passed (sorry, I don't have access to the right machine, so I can't test it).

On SYSV a parent does not have to wait for dead children if the parent has ignored SIGCHLD.  On BSD each and every child must be waited for either using wait(2) or wait3(2).

I've noticed a mistake in my previous comment, though: in a mixed SYSV/BSD system, re-initialise the signal handler after calling wait3, i.e. just before the handler returns.
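
A rough sketch of what that would look like, based on the sig_child() at the end of the question (it assumes the same wait3()/union wait declarations used there):

sig_child()
{
      int pid;
      union wait status;

      /* collect every child that has already exited, without blocking */
      while ((pid = wait3(&status, WNOHANG, (struct rusage *) 0)) > 0)
            ;

      /* on systems with unreliable (SYSV-style) signal(), re-arm the handler
       * just before returning */
      signal(SIGCLD, sig_child);
}
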
Avatar of sevrin

ASKER

I'm using Linux. Also, the echoing function doesn't work at all. I'll check your suggestions now.
Elfie & ecw have most of the pieces of the answer, but perhaps not expressed clearly enough.  Here's my attempt:

A zombie process is one which has exited, but whose parent is still running and has not called one of the wait() functions to retrieve the child's exit status.  I don't think zombies take up much in the way of system resources (memory, file descriptors, and so on), but they *do* use up a process slot.  If you have too many of them hanging around, you'll find that you can't log in or start new programs, because the system can't find a free slot to store information about the new process.  (Sometimes there are a few  process slots reserved for root, so you can recover the system if root is already logged in, but often people end up just rebooting the machine.)

If a parent process exits before its children do, then the children are "orphaned" and get adopted by process 1, the init process.  One of init's jobs is to collect the exit status of all the children it adopts and thereby prevent the accumulation of zombies.

There are two ways to prevent zombies if your parent process is supposed to hang around for a long time.  The first, as ecw says, is to use signal(3) to ignore the SIGCHLD signal.  If you do this, zombies are not [supposed to be] created.  This may only be true for SYSV; I don't have a BSD system around to check.  This method has the drawback ecw mentions: unix signals are pretty complex, and it can be hard to be sure you're catching them all (and even harder to debug once you figure out that you're not).

The second (and *much* better) way is to call wait(2) to retrieve child processes' exit statuses.  For example, the following code included in the daemon's main loop will take care of zombies:

int status;
while (waitpid( (pid_t)-1, &status, WNOHANG ) > 0)
      printf( "child process exited with status %d\n", status );

A good place to add this code is right after the close(newsockfd) in your server program.

For the answer to the other part of your question, "when should I call daemon_start()?", we have to look at what daemon_start() does.  One thing that jumps right out is that it closes all open file descriptors.  Therefore, if you call it where ecw suggests (after the listen() call), you'll close the socket you had planned to accept connections on.  That is not good.  You should call it *before* you open the socket.  The only downside to calling it that soon is that you end up closing stdout and stderr, so you won't see any messages from the program.  (Or worse, the messages will be sent to the client!)  In a real daemon, you have to be careful about logging messages to make sure they go where somebody will see them.  Most daemons use syslog, write to a log file themselves, or both.
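
For instance, a rough sketch of that ordering, with syslog(3) standing in for the printf calls (the program name string and facility are just examples):

#include <syslog.h>

      pname = argv[0];
      daemon_start(1);                                 /* detach before opening anything we want to keep */
      openlog("unixstr-server", LOG_PID, LOG_DAEMON);  /* stdout/stderr are closed now, so log via syslog */

      if ((sockfd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
            syslog(LOG_ERR, "server: can't open stream socket");
            exit(1);
      }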

Hope this helps.
Avatar of sevrin

ASKER

Thanks, DHM; your comments are very clear. I have some further questions, though:
1) Why are zombies being created anyway? I thought the signal handler already dealt with that possibility. Isn't the wait3() system call sufficient?
2) The code fragment you suggest, invoking the wait system call, seems to be corrupt. What I read is:

int status;
    while (waitpid( (pid_t)-1, &status, WNOHANG ) > 0)
    printf( "child process exited with status %d\n", status );

I don't know what "waitpid" is; also, according to my book, wait takes a single argument - a pointer to an integer. There seems to be nothing satisfying these criteria in the code fragment.

Your reasons as to why the code fragment should be put after close(newsockfd) are very clear - thank you.
Avatar of sevrin

ASKER

See above.
Sorry I didn't respond earlier...I guess EE changed the "email notification" default, so I didn't know there was any activity on this question until you rejected my answer.

For your first question: Signal handlers are tricky to get installed permanently (sometimes the handler gets reset to the default after a signal is caught), and the default for SIGCHLD is to ignore it, so you might not even know your handler is not being called.  Also, my Solaris docs _claim_ that zombie processes are not created if SIGCHLD is ignored, but I believe that claim is false, and all my daemons are coded to collect
exit statuses and thereby clean up any zombies.

Second, waitpid() is a variant of the wait() call.  wait() will block until a child process' status is available, so if your children are not finished yet (i.e. you have no zombies to clean up) then wait() will stop your process until a child does finish.  The waitpid() call allows you to control the waiting better: I pass the WNOHANG option, which means "get the exit status of an exited child if one is available, or else return immediately instead of blocking."  If your system doesn't have waitpid(), then wait3() will allow you to do the same thing.  In fact, in the sig_child() routine at the very end of your question, you see this idiom:

while ((pid = wait3(&status, WNOHANG, (struct rusage *) 0)) > 0)
     ;

which is equivalent to the code I suggested.  If you move it out of the signal handler and put it somewhere in your main loop instead, it should do the trick.
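
A sketch of that placement, with the reap loop sitting right after the parent's close(newsockfd) inside the accept loop:

            close(newsockfd);

            /* non-blocking: collects any children that have finished, then moves on */
            while ((pid = wait3(&status, WNOHANG, (struct rusage *) 0)) > 0)
                  ;
      }
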
Avatar of sevrin

ASKER

These suggestions make no difference: zombies are still clearly being produced (to the extent that I can't execute the client and test whether the echoing function works).

But given the phrasing of the question in Stevens' book "Unix Network Programming", where, as I say, this code comes from, I feel this isn't surprising. Stevens is clearly suggesting some fundamental modification of the server program is required, not just tinkering with the signal handling function. Stevens says:

The example servers in Section 6.6 do not call the daemon_start function which we developed in Section 2.6. Modify one of these servers to call this function. What other changes do you have to make to the server?

Can you think what these changes might be?
Do you mean that you can't even execute the client once?  Are zombies (and a parent process) still hanging around from a previous run?  Did you place the "while...wait3" loop somewhere in the main loop (for example, right after the close(newsockfd))? What version of Linux are you running?  If you add a copy of your current code as a comment, I'll run it on a Linux box here and see what I can find out.
Avatar of sevrin

ASKER

That's right: I can't execute the client once, now that the daemon_start function's involved. The zombies aren't hanging around from a previous run, as I have to reboot to get out of the system! The while... wait3 is where you suggested, as you'll see. Stevens' signal handler is still in place, but as that does the same thing I can't see it causing any problems. I'm running Linux 2.0.0, Slackware. The current code is as below.

Thanks, dhm!

/*
 * Example of server using UNIX domain stream protocol
 */
#include <sys/wait.h>
#include <signal.h>
#include "uniks.h"
#include "systype.h"
int readline(int, char *, int);
int writen(int, char *, int);

main(int argc, char *argv[])
{
      int                  pid, sockfd, newsockfd, clilen, childpid, servlen;
      struct sockaddr_un      cli_addr, serv_addr;
      union wait status;

      pname = argv[0];
      daemon_start(1);

      /*
       * Open a socket (a UNIX domain stream socket).
       */

      if ((sockfd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
            printf("server: can't open stream socket\n");

      /*
       * Bind our local address so that the client can send to us.
       */

      bzero((char *) &serv_addr, sizeof(serv_addr));
      serv_addr.sun_family = AF_UNIX;
      strcpy(serv_addr.sun_path, UNIXSTR_PATH);
      servlen = sizeof(serv_addr.sun_family) + strlen(serv_addr.sun_path);

      if (bind(sockfd, (struct sockaddr *) &serv_addr, servlen) < 0)
            printf("server: can't bind local address\n");

      listen(sockfd, 5);

      for ( ; ; ) {
            clilen = sizeof(cli_addr);
            if ((newsockfd = accept(sockfd, (struct sockaddr *) &cli_addr, &clilen)) < 0)
                  printf("server: accept error\n");

            if ((childpid = fork()) < 0)
                  printf("server: fork error\n");
            else if (childpid == 0) {      /* child process */
                  close(sockfd);
                  str_echo(newsockfd);
                  exit(0);
            }
            close(newsockfd);

            while ((pid = wait3(&status, WNOHANG, (struct rusage *) 0)) > 0)
                  ;
      }
}
      
/*
 * Read a stream socket one line at a time, and write each line back
 * to the sender. Return when the connection is terminated.
 */

#define MAXLINE 512

str_echo(int sockfd)
{
      int n;
      char line[MAXLINE];

      for ( ; ; ) {
            n = readline(sockfd, line, MAXLINE);
            if (n == 0)
                  return;            /* connection terminated */
            else if (n < 0)
                  printf("str_echo: readline error\n");

            if (writen(sockfd, line, n) != n)
                  printf("str_echo: writen error\n");
      }
}

/*
 * Read a line from a descriptor. Read the line one byte at a time,
 * looking for the newline. We store the newline in the buffer, then
 * follow it with a null. We return the number of characters up to but
 * not including the null.
 */

int readline(fd, ptr, maxlen)
register int      fd;
register char     *ptr;
register int      maxlen;
{
      int n, rc;
      char c;

      for (n = 1; n < maxlen; n++) {
            if ((rc = read(fd, &c, 1)) == 1) {
                  *ptr++ = c;
                  if (c == '\n')
                        break;
            }
            else if (rc == 0) {
                  if (n == 1)
                        return(0);      /* EOF, no data read */
                  else
                        break;
            }
            else
                  return(-1);            /* error */
      }
      *ptr = 0;
      return(n);
}

                                    
/* write n bytes to a descriptor. Use in place of write()
 * when fd is a stream socket.
 */

int writen(register int fd, register char *ptr, register int nbytes)
{
      int nleft, nwritten;

      nleft = nbytes;
      while (nleft > 0) {
            nwritten = write(fd, ptr, nleft);
            if (nwritten <= 0)
                  return(nwritten);            /* error */

            nleft -= nwritten;
            ptr += nwritten;
      }
      return(nbytes - nleft);
}

#include <signal.h>
#include <sys/param.h>

#ifdef SIGTSTP
#include <sys/file.h>
#include <sys/ioctl.h>
#endif

/*
 * Detach a daemon process from login session context.
 */

daemon_start(int ignsigcld)      /* nonzero -> handle SIGCLDs so zombies don't clog */
{

      register int childpid, fd;

/*
 * If we were started by init (process 1) from the /etc/inittab file
 * there's no need to detach. This test is unreliable owing to an
 * unavoidable ambiguity if the process is started by some other
 * process and orphaned (i.e., if the parent process terminates before
 * we are started).
 */

      if (getppid() == 1)
            goto out;

      /* Ignore the terminal stop signals (BSD) */

#ifdef SIGTTOU
      signal(SIGTTOU, SIG_IGN);
#endif
#ifdef SIGTTIN
      signal(SIGTTIN, SIG_IGN);
#endif
#ifdef SIGTSTP
      signal(SIGTSTP, SIG_IGN);
#endif

      /* If we were not started in the background, fork and
       * let the parent exit. This also guarantees the first
       * child is not a process group leader.
       */

      if ((childpid = fork()) < 0)
            printf("Can't fork first child\n");
      else if (childpid > 0)
            exit(0);       /* parent */

      /*
       * First child process. Dissociate from controlling terminal
       * and process group. Ensure the process can't reacquire a new
       * controlling terminal.
          */

#ifdef SIGTSTP            /* BSD */

      if(setpgrp(0, getpid()) == -1)       /* process will be a leader in its own group */
            printf("Can't change process group\n");

      if ((fd = open("/dev/tty", O_RDWR)) >= 0)
            {
            ioctl(fd, TIOCNOTTY, (char *)NULL);      /* lose controlling tty */
            close(fd);
            }
#else                  /* System V */

      if(setpgrp() == -1)
            printf("Can't change process group\n");

      signal(SIGHUP, SIG_IGN);      /* immune from pgrp leader death */

      if ((childpid = fork()) < 0)
            printf("Can't fork second child\n");
      else if (childpid > 0)
            exit(0);            /* first child */

      /* second child */
#endif

out:

      /*
       * Close any open file descriptors.
       */

      for (fd = 0; fd < NOFILE; fd++)
            close(fd);

      /* Move the current directory to root, to make sure we
       * aren't on a mounted filesystem.
       */

      chdir("/");

      /*
       * Clear any inherited file mode creation mask.
       */

      umask(0);

      /*
       * See if the caller isn't interested in the exit status of its
       * children, and doesn't want to have them become zombies and
       * clog up the system. With System V all we need do is ignore
       * the signal. With BSD, however, we have to catch each signal
       * and execute the wait3() system call.
        */

      if (ignsigcld)
            {
#if defined(BSD) || defined(_BSD)
            int sig_child();
            signal(SIGCLD, sig_child);      /* BSD */
#else
            signal(SIGCLD, SIG_IGN);      /* System V */
#endif
            }
}

/* This is a 4.3BSD SIGCLD signal handler that can be used by a server
 * that's not interested in its children's exit status, but needs to
 * wait for them, to avoid clogging up the system with zombies.
 * Beware that the calling process may get an interrupted system call
 * when we return, so they had better handle that.
 */

#include "systype.h"

sig_child()
{
#ifdef BSD
            /*
             * Use the wait3() system call with the WNOHANG option.
              */

      int pid;
      union wait status;

      while ((pid = wait3(&status, WNOHANG, (struct rusage *) 0)) > 0)
            ;
#endif
}


Ah yes, the old "don't bother doing anything about your error indicators" bug.  I had to tweak the program a little to get it to compile (I didn't have your header files uniks.h and systype.h but I was able to figure out what must've been in them.)  Anyway, I ran it once and it didn't do much, but at least it didn't hang my system.  Then I figured I'd run it with the debugger, and >whoa<!  Gazillions of copies of the server and the system slows to a crawl.  So I go back and take a closer look.  Here's the problem:

In the server, when you open the socket, bind the address, and accept client connections, if you get an error you just print a message and continue on.  Since the daemon_start() function has closed all the file descriptors, the message never actually shows up; you might want to change the loop

      for (fd = 0; fd < NOFILE; fd++) close(fd);

to

      for (fd = 3; fd < NOFILE; fd++) close(fd);

This will leave stdin, stdout, and stderr open, so you'll have a better chance of seeing the error messages.

Anyway, as I was saying, even if you get an error, you just continue on.  If you were to get an error in, say, the bind() function, then you'd have an unbound socket that you can't do accept() on.  And, in fact, the way unix sockets work, if there's an old socket left around from the previous run of the program, the bind() call will fail.  Therefore, the accept() call will also fail.  Therefore, you'll cruise right into the fork() call and start up a process to serve the non-existent client.  The child process will fail when it tries to read from the bad socket, and the parent process will eventually clean up the zombie.  However, while the child process is starting, finding out about the bad socket, and stopping, the parent has madly continued creating more children.  It's not the zombies that are clogging your system, it's the short-lived children.

So the answer to your problem is, check the results of your socket-related calls, and if they fail, exit the program (or figure out how to fix whatever the problem is, and try the call again.)  Here's a copy of your main() with better error handling. If one of the socket-setup calls fails, I just exit, and if I get more than 10 errors while serving clients, I also exit.  This kind of sanity checking can help prevent the runaway-daemon errors that often end up with a system reboot.

Good luck,

main(int argc, char *argv[])
{
      int pid, sockfd, newsockfd, clilen, childpid, servlen, nerrs;
      struct sockaddr_un cli_addr, serv_addr;
      union wait status;

      pname = argv[0];
      daemon_start(1);

      /*
       * Open a socket (a UNIX domain stream socket).
       */

      if ((sockfd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0) {
        printf("server: can't open stream socket\n");
            exit( errno );
      }

      /*
       * Bind our local address so that the client can send to us.
       */

      bzero((char *) &serv_addr, sizeof(serv_addr));
      serv_addr.sun_family = AF_UNIX;
      strcpy(serv_addr.sun_path, UNIXSTR_PATH);
      servlen = sizeof(serv_addr.sun_family) + strlen(serv_addr.sun_path);

      if (bind(sockfd, (struct sockaddr *) &serv_addr, servlen) < 0) {
        printf("server: can't bind local address\n");
            exit( errno );
      }

      listen(sockfd, 5);

      for ( nerrs = 0; nerrs < 10; ) {
        clilen = sizeof(cli_addr);
        if ((newsockfd = accept(sockfd, (struct sockaddr *) &cli_addr, &clilen)) < 0) {
                  printf("server: accept error\n");
                  ++nerrs;
                  continue;
            }
                  
        if ((childpid = fork()) < 0) {
                  printf("server: fork error\n");
                  ++nerrs;
            }
        else if (childpid == 0) { /* child process */
                  close(sockfd);
                  str_echo(newsockfd);
                  exit(0);
        }
        close(newsockfd);
        while ((pid = wait3(&status, WNOHANG, (struct rusage *)0)) >0)
                  ;
      }
}


 
Avatar of sevrin

ASKER

Thanks for the tips on error-handling, dhm. For some reason my system doesn't have all the functions like perror that Stevens uses; that's why I just printed the messages. But while I'm sure you're right about handling errors, it doesn't answer the question Stevens poses: that is, what changes have to be made to the server program to accommodate daemon_start? I'm sure there must be more to it than just error handling.
Avatar of sevrin

ASKER

dhm

I've compiled and run the modified server program you posted. It *does* get rid of the zombie/extra child problem - although I don't see how anticipating error conditions can do that! Did you modify the program in some other way? However, even though the server now seems to run, the client (code below) can't connect to it. In fact, it produces two errors: "can't connect to server" and "writen error". Any suggestions?

Best,

David

P.S. "thingo.txt" is simply a file to be echoed. The original version of the program, which I modified, took its input from standard input.


/*
 * Example of client using UNIX domain stream protocol
 */

#include <stdio.h>
#include "uniks.h"
int writen(register int, register char *, register int);
int readline(int, char *, int);

main(int argc, char *argv[])
{
      int                  sockfd, servlen;
      struct sockaddr_un      serv_addr;
      FILE *fp;

      pname = argv[0];

      /*
       * Fill in the structure "serv_addr" with the address of the
       * server that we want to send to.
       */

      bzero((char *) &serv_addr, sizeof(serv_addr));
      serv_addr.sun_family = AF_UNIX;
      strcpy(serv_addr.sun_path, UNIXSTR_PATH);
      servlen = strlen(serv_addr.sun_path) + sizeof(serv_addr.sun_family);

      /* Open a UNIX domain stream socket */

      if ((sockfd = socket(AF_UNIX, SOCK_STREAM, 0)) < 0)
            printf("client: can't open stream socket\n");

      /*
       * Connect to the server
       */

      if (connect(sockfd, (struct sockaddr *) &serv_addr, servlen) < 0)
            printf("client: can't connect to server\n");
      if (!(fp = fopen("thingo.txt", "r")))
            {
            printf("Can't open file\n");
            exit(1);
            }
      str_cli(fp, sockfd);            /* do it all */

      close(sockfd);
      exit(0);
}            

/* Read the contents of the FILE *fp, write each line to the
 * stream socket (to the server process), then read a line back from
 * the socket and write it to the standard output.
 *
 * Return to caller when an EOF is encountered on the input file.
 */

#define MAXLINE 512

str_cli(register FILE *fp, register int sockfd)
{
      int       n;
      char      sendline[MAXLINE], recvline[MAXLINE + 1];

      while (fgets(sendline, MAXLINE, fp) != NULL) {
            n = strlen(sendline);
            if (writen(sockfd, sendline, n) != n)
                  printf("str_cli: writen error on socket\n");

            /* Now read a line from the socket and write it to
             * our standard output.
             */

            n = readline(sockfd, recvline, MAXLINE);
            if (n < 0)
                  printf("str_cli: readline error\n");

            fputs(recvline, stdout);
      }
}

/*
 * Write n bytes to a descriptor.
 * Use in place of write() when fd is a stream socket.
 */

int writen(register int fd, register char *ptr, register int nbytes)
{

int nleft, nwritten;

nleft = nbytes;
while (nleft > 0)
      {
      nwritten = write(fd, ptr, nleft);
      if (nwritten <= 0)
            return(nwritten);            /* error */

      nleft -= nwritten;
      ptr += nwritten;
      }
      return(nbytes - nleft);
}

/*
 * Read a line from a descriptor. Read the line one byte at a time,
 * looking for the new line. We store the newline in the buffer,
 * then follow it with a null. We return the number of characters up
 * but not including the null.
 */

int readline(int fd, char *ptr, int maxlen)
{
      int n, rc;
      char c;

      for (n = 1; n < maxlen; n++)
            {
            if ((rc = read(fd, &c, 1)) == 1)
                  {
                  *ptr++ = c;
                  if (c == '\n')
                        break;
                  }
            else if (rc == 0)
                        {
                        if (n == 1)
                              return(0);      /* EOF, no data read */
                        else
                              break;            /* EOF, some data read */
                        }
            else
                  return(-1);                  /* error */
            }
            *ptr = 0;
            return(n);
}

Sorry for the delay again...out of town this time.  I'll try to get back to this question as soon as I catch up...
ASKER CERTIFIED SOLUTION
Avatar of dhm
dhm

BTW, perror() just prints out an error message.  It's handy, because it'll turn an error number into a string that describes the error, but basically it does nothing more than your "printf", so even if you had the function, it wouldn't have helped.  You still have to do *something* when you detect an error, and the easiest thing to do is simply to exit().  A real daemon would have to do more, of course -- figure out what the problem is, try to solve it, and make sure that its error messages go somewhere for the sysadmin to use in troubleshooting.
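
A small sketch of the difference, using the bind() call as an example (perror() comes from <stdio.h>; the important part is still to act on the failure):

      if (bind(sockfd, (struct sockaddr *) &serv_addr, servlen) < 0) {
            perror("server: can't bind local address");   /* appends the text for errno, e.g. "Address already in use" */
            exit(1);
      }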

The things Stevens is concerned about in his daemon_start() function are:

* The daemon should not have any files open (including the process' current directory) because open files will prevent the sys admin from unmounting the filesystem if he needs to.

* The daemon shouldn't be connected to a terminal, so that it doesn't receive signals if the user types ^C or logs off or whatever.

* The daemon has to clean up after any children it starts so that zombies don't accumulate.
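
As a rough, condensed sketch of those three concerns (this is not Stevens' code; setsid() is the POSIX call that replaces the setpgrp()/TIOCNOTTY sequence on Linux, and NOFILE comes from <sys/param.h> as in daemon_start):

      if (fork() > 0)
            exit(0);                    /* parent exits; the child carries on in the background */
      setsid();                         /* new session: no controlling terminal, so no tty signals */
      chdir("/");                       /* don't keep a mounted filesystem busy */
      umask(0);                         /* clear any inherited file mode creation mask */
      for (fd = 0; fd < NOFILE; fd++)
            close(fd);                  /* no stray open descriptors */
      signal(SIGCHLD, SIG_IGN);         /* or reap children with wait3()/waitpid() in the main loop */
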
Avatar of sevrin

ASKER

dhm,

Thanks for this. I think you've answered my question, actually. The file included by both server and client programs is "uniks.h", which is as follows:

/*
 * Definitions for UNIX domain stream and datagram client/server programs.
 */

#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>

#define UNIXSTR_PATH      "./s.unixstr"


char *pname;


This shows that the path is not an absolute path, and as you say, the daemon_start function wants to change to the root directory. I think that changing from a relative to an absolute path is the answer Stevens wanted. So if you'll send the "Question answered" message to me again, I'll give you the 150 points.
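
In other words, the change would be something like this in uniks.h (the directory here is only an example; it just needs to be somewhere the daemon can still create the socket after daemon_start() has done its chdir("/")):

#define UNIXSTR_PATH      "/tmp/s.unixstr"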

You seem to be right, too, when you say the server isn't running. However, I can't see how to correct this. In the pre-daemon_start version of the server, it would create a file called s.unixstr, and as long as that file was present it was not possible to run the server again. However, the daemon_start version of the server program doesn't appear to create this file. As I say, you more than deserve the points for answering my question, but I'd be grateful if you could drop a line or two explaining what might be going on.

 Incidentally, I have another question at Experts Exchange, and I'd be very grateful if you could cast your eye over that as well. I suspect it's a much simpler question than the one you've been helping me with, but another person keeps "answering" it, and he can't seem to understand the point I'm trying to make. The title of the question is caddr_t.

Thanks,

David K.  

Cool, the relative path is probably what's screwing you up.  When you're in the root directory, the server probably can't create the socket because of directory permissions, so I'd guess that's why the server isn't running (it tries to bind() the address but fails, so it exits.)  Something else to keep in mind is that the socket is never deleted, so you'll only be able to run the server once at most, unless you delete the socket by hand.  That might be a useful improvement: to do an "unlink(UNIXSTR_PATH)" before you try to create/bind the socket.
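
That improvement might look like this in the server, just before the bind() (an unlink() that fails because the file doesn't exist yet is harmless):

      unlink(UNIXSTR_PATH);      /* remove any stale socket left by a previous run */
      if (bind(sockfd, (struct sockaddr *) &serv_addr, servlen) < 0) {
            printf("server: can't bind local address\n");
            exit(1);
      }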

This question is still in "answer pending" mode, so there should be an "evaluate answer" section when you call it up.

I'll take a look at the other question, too.
Avatar of sevrin

ASKER

Please look out for any more questions I have!

David K.