Link to home
Create AccountLog in
Avatar of rbtt
rbttFlag for Trinidad and Tobago

asked on

LINUX Cluster - CMAN not starting on one node

We are running LINUX Cluster Suite 4 on  two nodes (Active/Passive).
As it stands now, one node is active while the other is having a problem joining the cluster.
One node has left the cluster and is unable to rejoin. When the clustat command is run on that node, the following is displayed:
[root@ttecprodnfs1 ~]# clustat
Segmentation fault

The server was then rebooted and the following message is being displayed:

CMAN 2.6.9-45.2 (built Jul 13 2006 11:42:36) installed
CMAN: Waiting to join or form a Linux-cluster
CMAN: sending membership request
CMAN: sending membership request
CMAN: sending membership request
CMAN: sending membership request
CMAN: sending membership request
CMAN: sending membership request
CMAN: sending membership request
CMAN: sending membership request

This node is unable to join the cluster
Avatar of arnold
arnold
Flag of United States of America image

Are you executing clustat on the current active node or on the node that you wish to join into the cluster?

Are there any errors on the current active node?  Are both nodes running the same versions of the kernel, cman, etc.?

Which OS and version are being used?

You could run strace or truss, depending on what is available on your system, e.g.
strace clustat, and see where the failure is.
Avatar of rbtt

ASKER

The command is being executed on the node that is trying to join the cluster, the passive node.
There are no noticeable errors on the active node.
The server has LINUX AS 4.4, the cluster suite is version 4.

strace command:
[root@ttecprodnfs1 ~]# strace clustat
execve("/usr/sbin/clustat", ["clustat"], [/* 20 vars */]) = 0
uname({sys="Linux", node="ttecprodnfs1", ...}) = 0
brk(0)                                  = 0x99d1000
access("/etc/ld.so.preload", R_OK)      = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY)      = 3
fstat64(3, {st_mode=S_IFREG|0644, st_size=75309, ...}) = 0
old_mmap(NULL, 75309, PROT_READ, MAP_PRIVATE, 3, 0) = 0xb7f70000
close(3)                                = 0
open("/lib/tls/libpthread.so.0", O_RDONLY) = 3
read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0P\30\267"..., 512) = 512
fstat64(3, {st_mode=S_IFREG|0755, st_size=93985, ...}) = 0
old_mmap(0xb6d000, 70108, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0xb6d000
old_mmap(0xb7b000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0xd000) = 0xb7b000
old_mmap(0xb7d000, 4572, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0xb7d000
close(3)                                = 0
open("/lib/libdl.so.2", O_RDONLY)       = 3
read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\260\33"..., 512) = 512
fstat64(3, {st_mode=S_IFREG|0755, st_size=15324, ...}) = 0
old_mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xb7f6f000
old_mmap(0xa51000, 12388, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0xa51000
old_mmap(0xa53000, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1000) = 0xa53000
close(3)                                = 0
open("/usr/lib/libncurses.so.5", O_RDONLY) = 3
read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\240!Q\003"..., 512) = 512
fstat64(3, {st_mode=S_IFREG|0755, st_size=879961, ...}) = 0
old_mmap(0x3504000, 264076, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x3504000
old_mmap(0x353c000, 32768, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x38000) = 0x353c000
old_mmap(0x3544000, 1932, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x3544000
close(3)                                = 0
open("/lib/tls/libc.so.6", O_RDONLY)    = 3
read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\320\216"..., 512) = 512
fstat64(3, {st_mode=S_IFREG|0755, st_size=1454802, ...}) = 0
old_mmap(0x924000, 1223900, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x924000
old_mmap(0xa49000, 16384, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x124000) = 0xa49000
old_mmap(0xa4d000, 7388, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0xa4d000
close(3)                                = 0
old_mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0xb7f6e000
mprotect(0xa49000, 4096, PROT_READ)     = 0
mprotect(0x91b000, 4096, PROT_READ)     = 0
set_thread_area({entry_number:-1 -> 6, base_addr:0xb7f6e6c0, limit:1048575, seg_32bit:1, contents:0, read_exec_only:0, limit_in_pages:1, seg_not_present:0, useable:1}) = 0
munmap(0xb7f70000, 75309)               = 0
set_tid_address(0xb7f6e708)             = 8438
rt_sigaction(SIGRTMIN, {0xb71380, [], SA_RESTORER|SA_SIGINFO, 0xb78890}, NULL, 8) = 0
rt_sigaction(SIGRT_1, {0xb713f0, [], SA_RESTORER|SA_RESTART|SA_SIGINFO, 0xb78890}, NULL, 8) = 0
rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0
getrlimit(RLIMIT_STACK, {rlim_cur=10240*1024, rlim_max=RLIM_INFINITY}) = 0
_sysctl({{CTL_KERN, KERN_VERSION}, 2, 0xbfe42d6c, 35, (nil), 0}) = 0
open("/lib/magma", O_RDONLY|O_NONBLOCK|O_LARGEFILE|O_DIRECTORY) = 3
fstat64(3, {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
fcntl64(3, F_SETFD, FD_CLOEXEC)         = 0
brk(0)                                  = 0x99d1000
brk(0x99f3000)                          = 0x99f3000
getdents64(3, /* 4 entries */, 4096)    = 120
getdents64(3, /* 0 entries */, 4096)    = 0
lseek(3, 0, SEEK_SET)                   = 0
getdents64(3, /* 4 entries */, 4096)    = 120
getdents64(3, /* 0 entries */, 4096)    = 0
close(3)                                = 0
stat64("/lib/magma/.", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
stat64("/lib/magma/..", {st_mode=S_IFDIR|0755, st_size=4096, ...}) = 0
stat64("/lib/magma/magma_gulm.so", {st_mode=S_IFREG|0755, st_size=11076, ...}) = 0
futex(0xa54060, FUTEX_WAKE, 2147483647) = 0
open("/lib/magma/magma_gulm.so", O_RDONLY) = 3
read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\254\r\0"..., 512) = 512
fstat64(3, {st_mode=S_IFREG|0755, st_size=11076, ...}) = 0
old_mmap(NULL, 14008, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x881000
old_mmap(0x884000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x2000) = 0x884000
close(3)                                = 0
open("/etc/ld.so.cache", O_RDONLY)      = 3
fstat64(3, {st_mode=S_IFREG|0644, st_size=75309, ...}) = 0
old_mmap(NULL, 75309, PROT_READ, MAP_PRIVATE, 3, 0) = 0xb7f70000
close(3)                                = 0
open("/usr/lib/libgulm.so.1", O_RDONLY) = 3
read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0t\21\0\000"..., 512) = 512
fstat64(3, {st_mode=S_IFREG|0644, st_size=59755, ...}) = 0
old_mmap(NULL, 21956, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7b4000
old_mmap(0x7b9000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x4000) = 0x7b9000
close(3)                                = 0
munmap(0xb7f70000, 75309)               = 0
gettid()                                = 8438
socket(PF_INET6, SOCK_STREAM, IPPROTO_IP) = 3
connect(3, {sa_family=AF_INET6, sin6_port=htons(40040), inet_pton(AF_INET6, "::1", &sin6_addr), sin6_flowinfo=0, sin6_scope_id=0}, 28) = -1 ECONNREFUSED (Connection refused)
close(3)                                = 0
munmap(0x881000, 14008)                 = 0
munmap(0x7b4000, 21956)                 = 0
stat64("/lib/magma/magma_sm.so", {st_mode=S_IFREG|0755, st_size=20332, ...}) = 0
open("/lib/magma/magma_sm.so", O_RDONLY) = 3
read(3, "\177ELF\1\1\1\0\0\0\0\0\0\0\0\0\3\0\3\0\1\0\0\0\314\20"..., 512) = 512
fstat64(3, {st_mode=S_IFREG|0755, st_size=20332, ...}) = 0
old_mmap(NULL, 23268, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0xf31000
old_mmap(0xf36000, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x4000) = 0xf36000
close(3)                                = 0
socket(0x1e /* PF_??? */, SOCK_DGRAM, 3) = 3
rt_sigaction(SIGINT, {0x8049888, [INT], SA_RESTORER|SA_RESTART, 0x94b898}, {SIG_DFL}, 8) = 0
rt_sigaction(SIGTERM, {0x8049888, [TERM], SA_RESTORER|SA_RESTART, 0x94b898}, {SIG_DFL}, 8) = 0
ioctl(3, 0x7805, 0)                     = 0
ioctl(3, 0x80087803, 0)                 = -1 ENOENT (No such file or directory)
getuid32()                              = 0
ioctl(3, 0x80087803, 0)                 = -1 ENOENT (No such file or directory)
socket(PF_FILE, SOCK_STREAM, 0)         = 4
connect(4, {sa_family=AF_FILE, path="/var/run/cluster/ccsd.sock"}, 110) = 0
write(4, "\1\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0", 20) = 20
read(4, "\1\0\0\0\0\0\0\0\0\0\0\0\221\377\377\377\0\0\0\0", 20) = 20
close(4)                                = 0
ioctl(3, 0x80087803, 0)                 = -1 ENOENT (No such file or directory)
--- SIGSEGV (Segmentation fault) @ 0 (0) ---
+++ killed by SIGSEGV +++
Process 8438 detached
ASKER CERTIFIED SOLUTION
Avatar of arnold
arnold
Flag of United States of America image

Link to home
membership
Create an account to see this answer
Signing up is free. No credit card required.
Create Account
Avatar of rbtt

ASKER

On the passive node I am trying to stop the CMAN service, but this operation is failing.
I also tried rebooting the passive node; again the node did not join the cluster - the CMAN service failed.

Shouldn't the passive node be able to join the cluster after a reboot?
The problem is that the CMAN service cannot be started on the passive node.

Is there a way I can force-kill the CMAN service for a restart?
To stop cman from starting on boot, you can use chkconfig.
http://linux.die.net/man/8/chkconfig
Run chkconfig --list | grep cman, and repeat the same for the related services.
Then run:
chkconfig cman off
and repeat for the others.
If cman and the others are not listed by chkconfig, you would need to search the /etc/rcX.d directories (one per runlevel) to see where they are started.

Another option is to rename cluster.conf so that there is no /etc/cluster/cluster.conf on the passive node.
On the active node,  remove the passive node from the cluster.
Reboot the passive node.
On the active node, re-add the node.
Provided both nodes have identical kernel and cluster suite versions, rebooting the newly added node or starting the services should get the node back into the cluster.

It is also possible that on your passive node the configuration is corrupt.  Copying the configuration from the current active node might do the job as well.