Friday, January 22, 2010

Oracle HAS startup failed, After stop syslogd

Just something strange or ..., Oracle High Availability Services startup failed. After stopped syslogd. Hope someone explain this issue.
Using 11gR2 Grid + ASM LIBS + Loop devices.

Let me show:
# /etc/init.d/syslog stop
Shutting down kernel logger: [ OK ]
Shutting down system logger: [ OK ]
# /oracle/grid2/bin/crsctl start has
CRS-4124: Oracle High Availability Services startup failed.
CRS-4000: Command Start failed, or completed with errors.
# /oracle/grid2/bin/crsctl check has
CRS-4639: Could not contact Oracle High Availability Services
# /etc/init.d/syslog start
Starting system logger: [ OK ]
Starting kernel logger: [ OK ]
# /oracle/grid2/bin/crsctl start has
CRS-4123: Oracle High Availability Services has been started.
# /oracle/grid2/bin/crsctl check has
CRS-4638: Oracle High Availability Services is online
and then Check more:
Case: syslogd stopped and HAS startup failed
# /etc/init.d/syslog status
syslogd is stopped
klogd is stopped

# strace /oracle/grid2/bin/crsctl start has
execve("/oracle/grid2/bin/crsctl", ["/oracle/grid2/bin/crsctl", "start", "has"], [/* 23 vars */]) = 0
brk(0) = 0xa684000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b1c9c71f000
uname({sys="Linux", node="RHEL5-TEST", ...}) = 0
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=116257, ...}) = 0
mmap(NULL, 116257, PROT_READ, MAP_PRIVATE, 3, 0) = 0x2b1c9c72000
.
.
.
socket(PF_FILE, SOCK_STREAM, 0) = 4
connect(4, {sa_family=AF_FILE, path="/var/tmp/.oracle/sOHASD_UI_SOCKET"...}, 110) = -1 ECONNREFUSED (Connection refused)
access("/var/tmp/.oracle/sOHASD_UI_SOCKET", F_OK) = 0
close(4) = 0
close(3) = 0
open("/oracle/grid2/crs/mesg/crsus.msb", O_RDONLY) = 3
fcntl(3, F_SETFD, FD_CLOEXEC) = 0
lseek(3, 0, SEEK_SET) = 0
read(3, "\25\23\"\1\23\3\t\t\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 256) = 256
lseek(3, 512, SEEK_SET) = 512
read(3, "\21'\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(3, 1024, SEEK_SET) = 1024
read(3, "\226\0\245\0\262\0\307\0\330\0\345\0\364\0\376\0\7\1\21\0013\1;\1D\1\231\1\303\1\370\1"..., 512) = 512
lseek(3, 99840, SEEK_SET) = 99840
read(3, "\t\0\37\22\0\0>\0 \22\0\0q\0!\22\0\0\244\0\"\22\0\0\313\0#\22\1\0\363\0"..., 512) = 512
lseek(3, 115712, SEEK_SET) = 115712
read(3, "\377\377\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(3, 116224, SEEK_SET) = 116224
read(3, "\377\377\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(3, 116736, SEEK_SET) = 116736
read(3, "\0\0\0\0\0\0\10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
close(3) = 0
nanosleep({5, 0}, {0, 422276128}) = 0
open("/oracle/grid2/crs/mesg/crsus.msb", O_RDONLY) = 3
fcntl(3, F_SETFD, FD_CLOEXEC) = 0
lseek(3, 0, SEEK_SET) = 0
read(3, "\25\23\"\1\23\3\t\t\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 256) = 256
lseek(3, 512, SEEK_SET) = 512
read(3, "\21'\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(3, 1024, SEEK_SET) = 1024
read(3, "\226\0\245\0\262\0\307\0\330\0\345\0\364\0\376\0\7\1\21\0013\1;\1D\1\231\1\303\1\370\1"..., 512) = 512
lseek(3, 75264, SEEK_SET) = 75264
read(3, "\3\0\32\20\3\0\32\0\33\20\0\0\30\1\34\20\0\0K\1\0\0\0\0|\1 crsc"..., 512) = 512
lseek(3, 115712, SEEK_SET) = 115712
read(3, "\377\377\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(3, 116224, SEEK_SET) = 116224
read(3, "\377\377\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(3, 116736, SEEK_SET) = 116736
read(3, "\0\0\0\0\0\0\10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
close(3) = 0
fstat(1, {st_mode=S_IFREG|0644, st_size=249965, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2aaaaade2000
open("/oracle/grid2/crs/mesg/crsus.msb", O_RDONLY) = 3
fcntl(3, F_SETFD, FD_CLOEXEC) = 0
lseek(3, 0, SEEK_SET) = 0
read(3, "\25\23\"\1\23\3\t\t\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 256) = 256
lseek(3, 512, SEEK_SET) = 512
read(3, "\21'\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(3, 1024, SEEK_SET) = 1024
read(3, "\226\0\245\0\262\0\307\0\330\0\345\0\364\0\376\0\7\1\21\0013\1;\1D\1\231\1\303\1\370\1"..., 512) = 512
lseek(3, 52224, SEEK_SET) = 52224
read(3, "\10\0\353\n\1\0008\0\354\n\1\0\212\0\355\n\3\0\320\0\356\n\0\0\24\1\240\17\0\0q\1"..., 512) = 512
lseek(3, 115712, SEEK_SET) = 115712
read(3, "\377\377\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(3, 115712, SEEK_SET) = 115712
read(3, "\377\377\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(3, 116224, SEEK_SET) = 116224
read(3, "\377\377\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(3, 116736, SEEK_SET) = 116736
read(3, "\0\0\0\0\0\0\10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
close(3) = 0
write(1, "CRS-4124: Oracle High Availabili"..., 118CRS-4124: Oracle High Availability Services startup failed.
CRS-4000: Command Start failed, or completed with errors.
) = 118
munmap(0x2aaaaacd7000, 1091040) = 0
futex(0x19214b84, FUTEX_CMP_REQUEUE_PRIVATE, 1, 2147483647, 0x19214ab0, 4) = 1
futex(0x19214ab0, FUTEX_WAKE_PRIVATE, 1) = 1
brk(0x192d7000) = 0x192d7000
munmap(0x2b183c79f000, 4198400) = 0
munmap(0x2b183c77c000, 143360) = 0
munmap(0x2b183c759000, 143360) = 0
munmap(0x2b183c736000, 143360) = 0
exit_group(1) = ?

Case: syslogd start and HAS startup... -)
# /etc/init.d/syslog status
syslogd (pid 17175) is running...
klogd (pid 17178) is running...

# strace /oracle/grid2/bin/crsctl start has
execve("/oracle/grid2/bin/crsctl", ["/oracle/grid2/bin/crsctl", "start", "has"], [/* 23 vars */]) = 0
brk(0) = 0x17268000
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2b0b8bab9000
uname({sys="Linux", node="RHEL5-TEST", ...}) = 0
access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory)
open("/etc/ld.so.cache", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0644, st_size=116257, ...}) = 0
mmap(NULL, 116257, PROT_READ, MAP_PRIVATE, 3, 0) = 0x2b0b8baba000
.
.
.
open("/oracle/grid2/auth/ohasd/rhel5-test/A5291056/8f9b98b1", O_WRONLY|O_CREAT|O_EXCL, 0644) = 5
fchmod(5, 0644) = 0
write(5, "c\200\n#", 4) = 4
close(5) = 0
write(4, "4\0\0\0\2\0\2\0\1\1\1\1\3\0\0\0\0\0\0\0\1\0\0\0\362\2161\1\0\0\0\0"..., 52) = 52
write(4, "8\0\0\0\3\0\2\0\1\1\1\1\4\0\0\0\0\0\0\0\1\0\0\0\362\2161\1\0\0\0\0"..., 56) = 56
times({tms_utime=2, tms_stime=2, tms_cutime=0, tms_cstime=0}) = 431499395
times({tms_utime=2, tms_stime=2, tms_cutime=0, tms_cstime=0}) = 431499395
poll([{fd=3, events=POLLIN|POLLRDNORM}, {fd=4, events=POLLIN|POLLRDNORM}], 2, -1) = 1 ([{fd=4, revents=POLLIN|POLLRDNORM}])
times({tms_utime=2, tms_stime=2, tms_cutime=0, tms_cstime=0}) = 431499395
times({tms_utime=2, tms_stime=2, tms_cutime=0, tms_cstime=0}) = 431499395
read(4, "8\0\0\0\3\0\2\0\1\1\1\1\3\0\0\0\0\0\0\0\1\0\0\0\36\2251\1\0\0\0\0"..., 32768) = 56
write(4, "\237\1\0\0PC\0\0\v\2\2\0\5\0\0\0\0\0\0\0\1\0\0\0\362\2161\1\0\0\0\0"..., 415) = 415
times({tms_utime=2, tms_stime=2, tms_cutime=0, tms_cstime=0}) = 431499395
poll([{fd=3, events=POLLIN|POLLRDNORM}, {fd=4, events=POLLIN|POLLRDNORM}], 2, -1) = 1 ([{fd=4, revents=POLLIN|POLLRDNORM}])
times({tms_utime=2, tms_stime=2, tms_cutime=0, tms_cstime=0}) = 431499500
times({tms_utime=2, tms_stime=2, tms_cutime=0, tms_cstime=0}) = 431499500
read(4, "\211\2\0\0PC\0\0\v\2\2\0\4\0\0\0\0\0\0\0\1\0\0\0\36\2251\1\0\0\0\0"..., 32768) = 649
times({tms_utime=2, tms_stime=2, tms_cutime=0, tms_cstime=0}) = 431499500
times({tms_utime=2, tms_stime=2, tms_cutime=0, tms_cstime=0}) = 431499500
times({tms_utime=2, tms_stime=2, tms_cutime=0, tms_cstime=0}) = 431499500
write(4, "0\0\0\0\2\0\20\0\1\0\1\0\0\0\0\0\0\0\0\0\0\0\0\0\362\2161\1\0\0\0\0"..., 48) = 48
fcntl(4, F_GETFL) = 0x802 (flags O_RDWR|O_NONBLOCK)
fcntl(4, F_SETFL, O_RDWR) = 0
close(4) = 0
close(3) = 0
open("/oracle/grid2/crs/mesg/crsus.msb", O_RDONLY) = 3
fcntl(3, F_SETFD, FD_CLOEXEC) = 0
lseek(3, 0, SEEK_SET) = 0
read(3, "\25\23\"\1\23\3\t\t\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 256) = 256
lseek(3, 512, SEEK_SET) = 512
read(3, "\21'\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(3, 1024, SEEK_SET) = 1024
read(3, "\226\0\245\0\262\0\307\0\330\0\345\0\364\0\376\0\7\1\21\0013\1;\1D\1\231\1\303\1\370\1"..., 512) = 512
lseek(3, 75264, SEEK_SET) = 75264
read(3, "\3\0\32\20\3\0\32\0\33\20\0\0\30\1\34\20\0\0K\1\0\0\0\0|\1 crsc"..., 512) = 512
lseek(3, 115712, SEEK_SET) = 115712
read(3, "\377\377\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(3, 116224, SEEK_SET) = 116224
read(3, "\377\377\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
lseek(3, 116736, SEEK_SET) = 116736
read(3, "\0\0\0\0\0\0\10\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 512) = 512
close(3) = 0
fstat(1, {st_mode=S_IFREG|0644, st_size=69208, ...}) = 0
mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x2aaaaade2000
write(1, "CRS-4123: Oracle High Availabili"..., 62CRS-4123: Oracle High Availability Services has been started.
) = 62
munmap(0x2aaaaacd7000, 1091040) = 0
futex(0xd178b84, FUTEX_CMP_REQUEUE_PRIVATE, 1, 2147483647, 0xd178ab0, 4) = 1
times({tms_utime=2, tms_stime=2, tms_cutime=0, tms_cstime=0}) = 431499500
nanosleep({0, 100000000}, {140734224759904, 4314995000}) = 0
times({tms_utime=2, tms_stime=2, tms_cutime=0, tms_cstime=0}) = 431499510
brk(0xd23b000) = 0xd23b000
munmap(0x2b3f70f01000, 4198400) = 0
munmap(0x2b3f70ede000, 143360) = 0
munmap(0x2b3f70ebb000, 143360) = 0
munmap(0x2b3f70e98000, 143360) = 0
exit_group(0)

However, I hope to know idea and some comment -)

6 comments:

Coskan Gundogar said...

looks like HAS uses the syslogd doesn't it

In your tests HAS was always down prior to syslogd stop, Did you also check already running Oracle HAS when you stop syslogd. I wonder if it depends on syslogd to stay alive or it continues to stay alive when you just stop syslogd but keep has

Surachart Opun said...

To Coskan,

I tested stop syslogd while Oracle HAS running -> no problem.

I just had the problem -> when syslogd stopped and then start Oracle HAS

-)

Coskan Gundogar said...

Just guessing !!!

Can you check syslogs when you start HAS. If it leaves a log to the syslogs when it starts up then this might be about it.

Also can you test to stop sunning HAS when syslogd already stopped (if my initial theory is true then it needs logging from service stop as well and might raise another error because of the stopped syslogd )

as you say just for fun :)

Surachart Opun said...

Thank you for your idea -)

Stop HAS (syslogd stopped) -> no problem.

I hope I'll have time to test this issue again.

Surachart Opun said...

To Coskan,

I asked this issue on communities.oracle.com (and thank you). they explained... when stopped syslogd and start HAS failed...
- /dev/log is not a socket at $GRID/bin/crswrapexece.pl


-)

Surachart Opun said...

This appears to be fixed in (unpublished) bug.

Oracle Support will add this fix to the list to go into future 11.2 clusterware bundle patches.

Thank You Oracle support.