On Monday 19 of March 2012, Michael Meeks wrote:
On Mon, 2012-03-19 at 07:33 +0100, Lubos Lunak wrote: Oh, I see. I've already noticed this myself, and that's a good explanation for Voreppe's (lack of) builds. That's a rather bad bug for tinderbox builds, and we really could use a tinderbox watching over our commits breaking the MSVC build (I think I fixed 5 MSVC regressions last week at the very least). Yep - perhaps a re-boot-box-and-restart-build-after-4-hours of no watchdog ping or something? :-)
Nah, so crude :). I've written a make watchdog, it's currently being tested on the Win-x86_6-fast tinderbox to see how it works in practice.
Anyhow - glad to see you Windows-ised :-)
Did anyone say I was staying 8-O ? -- Lubos Lunak l.lunak@suse.cz
# Build and self-test rules for the make watchdog (makewatchdog.cpp).
# NOTE(review): the recipe lines below carry no leading tab in this copy, which make
# requires -- presumably stripped when the mail was archived; restore before use.
# Number of seconds each fake build step sleeps before writing its output file.
TIMEOUT=1000
CXX=g++
CXXFLAGS=-O2
.PHONY: clean test_clean all test perform_test
# Compile the watchdog binary.
all:
$(CXX) -Wall $(CXXFLAGS) makewatchdog.cpp -o makewatchdog
# Run the watchdog around a nested parallel make: output file "log", timeout 20,
# at most 2 attempts. The nested steps sleep for $(TIMEOUT) seconds without output.
test: all test_clean
./makewatchdog log 20 2 $(MAKE) -j2 perform_test
# Target built by the nested make; needs both copied files.
perform_test: first second
first: output
cp output first
second: output2
cp output2 second
# Fake long-running build steps: sleep, then write "kuk" to the target file.
output:
( sleep $(TIMEOUT); echo kuk ) >output
output2:
( sleep $(TIMEOUT); echo kuk ) >output2
# Remove everything the test run creates, including the log file.
test_clean:
rm -f first second output output2 log
# Remove test artifacts and the watchdog binary.
clean: test_clean
rm -f makewatchdog
/*
Copyright (c) 2012 Lubos Lunak <l.lunak@suse.cz>
Permission is hereby granted, free of charge, to any person obtaining a
copy of this software and associated documentation files (the "Software"),
to deal in the Software without restriction, including without limitation
the rights to use, copy, modify, merge, publish, distribute, sublicense,
and/or sell copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.
*/
#include <algorithm>
#include <dirent.h>
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <string.h>
#include <sys/wait.h>
#include <time.h>
#include <unistd.h>
#include <vector>
// Uncomment to enable the extra diagnostics guarded by #ifdef DEBUG below
// (messages on stderr and a shorter polling sleep in watchMake()).
//#define DEBUG
// Lets the rest of the file use string and vector without the std:: prefix.
using namespace std;
// Writes the command-line synopsis to standard output.
// argv0 is the program name to show in the synopsis.
// Always returns 0; main() passes that value on as the process exit code.
static int usage( const char* argv0 )
{
    fprintf( stdout, "Usage: %s [outputFile] [timeout] [maxAttempts] [make command...]\n", argv0 );
    return 0;
}
// Exit code returned when the watchdog itself fails (pipe/fork/open errors) or
// when a wait status is neither a normal exit nor a signal termination.
const int FAILURE = 3; // do not use 1 or 2 (check exit values make uses)
// Prefix for the watchdog's own messages; prepended by string literal concatenation.
#define NAME "Make watchdog: "
// Snapshot of a single process as collected from /proc by findAllProcesses().
struct ProcInfo
{
// Process id, taken from the numeric /proc directory name.
pid_t pid;
// Parent process id, parsed from /proc/<pid>/stat.
pid_t parent;
// Text read from /proc/<pid>/cmdline; left empty if the file cannot be opened.
// Only used in diagnostic messages.
string cmdline;
};
// A list of process snapshots.
typedef vector< ProcInfo > ProcInfoList;
// Scans /proc and returns one ProcInfo per process whose stat file could be
// read and whose parent pid could be parsed.
//
// Entries are skipped when the directory name is not a number, when the stat
// file cannot be opened or read, when no non-zero parent pid is found, or when
// a process claims to be its own parent. A missing cmdline file is not an
// error; the cmdline field is then left empty.
//
// Returns an empty list (after printing a message) if /proc cannot be opened.
static ProcInfoList findAllProcesses()
{
    ProcInfoList procInfos;
    DIR* dir = opendir( "/proc/" );
    if( dir == NULL )
    {
        fprintf( stderr, NAME "Cannot read /proc.\n" );
        return procInfos;
    }
    while( dirent* entry = readdir( dir ))
    {
        char buf[ 16384 ];
        ProcInfo procInfo;
        procInfo.pid = atoi( entry->d_name );
        if( procInfo.pid == 0 )
            continue; // not a per-process directory
        const string procDir = string( "/proc/" ) + entry->d_name;

        FILE* statFile = fopen(( procDir + "/stat" ).c_str(), "r" );
        if( statFile == NULL )
        {
#ifdef DEBUG
            fprintf( stderr, "Cannot open stat for %s\n", entry->d_name );
#endif
            continue;
        }
        const size_t statSize = fread( buf, 1, sizeof( buf ) - 1, statFile );
        const bool statReadFailed = ferror( statFile ) != 0;
        fclose( statFile );
        if( statReadFailed )
        {
#ifdef DEBUG
            fprintf( stderr, "Cannot read stat for %s\n", entry->d_name );
#endif
            continue;
        }
        buf[ statSize ] = '\0';

        // The command name sits between parentheses and may itself contain
        // parentheses, hence the search for the LAST ')'. After it come a
        // space, the one-character state and then the parent pid.
        int parent = 0;
        if( const char* lparen = strchr( buf, '(' ))
            if( const char* rparen = strrchr( lparen, ')' ))
                if( rparen[ 1 ] != '\0' ) // never step past the terminator
                    if( sscanf( rparen + 2, "%*c %d", &parent ) != 1 )
                        parent = 0;
        procInfo.parent = parent;
        if( procInfo.parent == 0 )
            continue;
        if( procInfo.pid == procInfo.parent )
            continue; // just in case

        if( FILE* cmdlineFile = fopen(( procDir + "/cmdline" ).c_str(), "r" ))
        {
            // Bounded read: an unbounded "%s" scan could overflow buf.
            // Arguments in cmdline are NUL-separated, so converting the
            // buffer to a string keeps the first argument (the command).
            const size_t cmdlineSize = fread( buf, 1, sizeof( buf ) - 1, cmdlineFile );
            fclose( cmdlineFile );
            buf[ cmdlineSize ] = '\0';
            procInfo.cmdline = buf;
        }
        else
        { // not an error
#ifdef DEBUG
            fprintf( stderr, "Cannot read cmdline for %s\n", entry->d_name );
#endif
        }
        // ok
        procInfos.push_back( procInfo );
    }
    closedir( dir );
    return procInfos;
}
static void findToKillRecursive( pid_t parent, const ProcInfoList& allProcesses, ProcInfoList*
toKill )
{
for( unsigned int i = 0;
i < allProcesses.size();
++i )
if( allProcesses[ i ].parent == parent )
{
findToKillRecursive( allProcesses[ i ].pid, allProcesses, toKill );
toKill->push_back( allProcesses[ i ] );
}
}
// Builds the list of processes to signal when terminating `topParent`:
// all of its descendants (as collected by findToKillRecursive()) followed by
// the entry for topParent itself, if one is present in the process snapshot.
static vector< ProcInfo > findToKill( pid_t topParent )
{
    const ProcInfoList snapshot = findAllProcesses();
    ProcInfoList victims;
    findToKillRecursive( topParent, snapshot, &victims );

    // Locate the top parent's own entry; it goes last in the list.
    ProcInfoList::const_iterator top = snapshot.begin();
    while( top != snapshot.end() && top->pid != topParent )
        ++top;
    if( top != snapshot.end())
        victims.push_back( *top );
#ifdef DEBUG
    else
        fprintf( stderr, "Top parent process info not found.\n" );
#endif
    return victims;
}
// Converts a wait status (as filled in by waitpid()) into an exit code:
//  - normal exit: the child's own exit status
//  - killed by a signal: 128 + the signal number
//  - anything else: FAILURE
static int makeExitCode( int status )
{
    if( WIFSIGNALED( status ))
        return 128 + WTERMSIG( status );
    return WIFEXITED( status ) ? WEXITSTATUS( status ) : FAILURE;
}
// How a make run ended, as seen by the watchdog. main() starts each attempt
// with SuccessfullExit; killMake() overwrites it when it has to terminate make.
enum KillStatus
{
SuccessfullExit, // exited cleanly
KilledInterrupted, // was interrupted (cleanly)
KilledForced // force killed (not clean)
};
static int killMake( pid_t pid, KillStatus* killed )
{
#ifdef DEBUG
fprintf( stderr, "Going to kill pid %d.\n", pid );
#endif
ProcInfoList toKill = findToKill( pid );
// SIGINT first
for( unsigned i = 0;
i < toKill.size();
++i )
kill( toKill[ i ].pid, SIGINT );
time_t t = time( NULL );
while( t + 10 > time( NULL ))
sleep( 2 ); // may get interrupted by a signal
int status;
bool pidHasFinished = false;
// need to clean up the top parent
if( waitpid( pid, &status, WNOHANG ) >= 0 )
{
pidHasFinished = true;
*killed = KilledInterrupted;
}
// now forcibly
for( unsigned i = 0;
i < toKill.size();
++i )
{
if( kill( toKill[ i ].pid, 0 ) == 0 ) // still alive?
{
*killed = KilledForced; // unclear cleanup
fprintf( stderr, NAME "Process %d not interrupted, forcibly killing.\n", toKill[ i
].pid );
fprintf( stderr, NAME "Cmdline: %s\n", toKill[ i ].cmdline.c_str());
kill( toKill[ i ].pid, SIGKILL );
}
}
if( !pidHasFinished )
waitpid( pid, &status, 0 );
return makeExitCode( status );
}
// Switches `fd` to non-blocking mode and marks it close-on-exec.
//
// O_NONBLOCK is a file status flag (F_SETFL) and is added to the flags already
// set on the descriptor. Close-on-exec is a descriptor flag and has to be set
// with F_SETFD / FD_CLOEXEC; passing O_CLOEXEC to F_SETFL has no effect.
//
// Returns true on success; on failure prints the failing call via perror()
// and returns false.
bool makeNonBlocking( int fd )
{
    const int statusFlags = fcntl( fd, F_GETFL );
    if( statusFlags < 0 )
    {
        perror( NAME "fcntl( F_GETFL )" );
        return false;
    }
    if( fcntl( fd, F_SETFL, statusFlags | O_NONBLOCK ) < 0 )
    {
        perror( NAME "fcntl( F_SETFL )" );
        return false;
    }
    const int descriptorFlags = fcntl( fd, F_GETFD );
    if( descriptorFlags < 0 )
    {
        perror( NAME "fcntl( F_GETFD )" );
        return false;
    }
    if( fcntl( fd, F_SETFD, descriptorFlags | FD_CLOEXEC ) < 0 )
    {
        perror( NAME "fcntl( F_SETFD )" );
        return false;
    }
    return true;
}
// Write end of the pipe used to report SIGCHLD to watchMake(); set there
// before the handler is installed.
static int childPipeWrite;
// SIGCHLD handler: writes a single byte to childPipeWrite so that the main
// loop can notice the child's state change by reading the pipe.
static void childHandler( int )
{
    // write() may modify errno; restore it so the interrupted code still sees
    // the value it expects.
    const int savedErrno = errno;
    const char c = '\0';
    // Nothing useful can be done about a failed write from inside a handler.
    const ssize_t written = write( childPipeWrite, &c, 1 );
    (void) written;
    errno = savedErrno;
}
// Supervises the already started make process `pid`.
//
// The function polls (sleeping between rounds) for two things:
//  - the child having exited, reported by the SIGCHLD handler through a pipe,
//    with a non-blocking waitpid() as a fallback in case the signal arrived
//    before the handler was installed;
//  - new data having been appended to watchFd, which counts as activity.
// If no activity is seen for more than `timeout` seconds, make is terminated
// with killMake(), which also sets *killed.
//
// Returns make's exit code (see makeExitCode()), the result of killMake() on
// timeout, or FAILURE if supervision could not be set up or the child's status
// could not be collected.
static int watchMake( pid_t pid, KillStatus* killed, int timeout, int watchFd )
{
    int pipeFd[ 2 ];
    if( pipe( pipeFd ) < 0 )
    {
        perror( NAME "pipe()" );
        return FAILURE;
    }
    childPipeWrite = pipeFd[ 1 ];
    const int childPipeRead = pipeFd[ 0 ];
    if( !makeNonBlocking( childPipeRead ))
    {
        close( pipeFd[ 0 ] );
        close( pipeFd[ 1 ] );
        return FAILURE;
    }
    struct sigaction act;
    act.sa_handler = childHandler;
    sigemptyset( &act.sa_mask );
    act.sa_flags = SA_NOCLDSTOP;
#ifdef SA_RESTART
    act.sa_flags |= SA_RESTART;
#endif
    sigaction( SIGCHLD, &act, NULL );

    time_t lastActivity = time( NULL );
    int result = FAILURE;
    bool timedOut = false;
    for(;;)
    {
        // select() cannot be used to watch for more data in watchFd, because it is a file
        // and our reading position will be at its end for most of the time, meaning that
        // select() will signal the fd is ready to read (i.e. eof)
#ifdef DEBUG
        sleep( 2 );
#else
        sleep( 10 );
#endif
        char buf[ 1024 ];
        int status = 0;
        bool childExited = false;
        if( read( childPipeRead, buf, 1 ) > 0 )
        {
#ifdef DEBUG
            fprintf( stderr, "Child exited\n" );
#endif
            pid_t waited;
            while(( waited = waitpid( pid, &status, 0 )) < 0 && errno == EINTR )
                ;
            if( waited != pid )
                break; // status unavailable, result stays FAILURE
            childExited = true;
        }
        else if( waitpid( pid, &status, WNOHANG ) == pid )
        {
            // SIGCHLD was delivered before the handler was in place.
            childExited = true;
        }
        if( childExited )
        {
            result = makeExitCode( status );
            break;
        }
        if( read( watchFd, buf, sizeof( buf )) > 0 )
        {
#ifdef DEBUG
            fprintf( stderr, "Activity in output file.\n" );
#endif
            // Skip to the end of the file so only new output counts next time.
            while( read( watchFd, buf, sizeof( buf )) > 0 )
                ;
            lastActivity = time( NULL );
        }
        else if( lastActivity + timeout < time( NULL ))
        { // timeout
#ifdef DEBUG
            fprintf( stderr, "Activity timeout.\n" );
#endif
            timedOut = true;
            break;
        }
    }
    // Uninstall the handler before closing the pipe it writes to.
    signal( SIGCHLD, SIG_DFL );
    close( pipeFd[ 0 ] );
    close( pipeFd[ 1 ] );
    if( timedOut )
        result = killMake( pid, killed );
    return result;
}
static int runMake( int argc, char** argv, KillStatus* killed, int timeout, int watchFd )
{
pid_t pid = fork();
switch( pid )
{
default: // parent
return watchMake( pid, killed, timeout, watchFd );
case 0: // child
close( watchFd );
execvp( argv[ 0 ], argv );
break;
case -1: // failure
perror( NAME "fork()" );
break;
}
fprintf( stderr, NAME "Make command invocation failed.\n" );
return FAILURE;
}
// Redirects stdout and stderr of this process (and therefore of everything it
// starts) to `outputFileName`, which is created or truncated and opened in
// append mode, then opens the same file again read-only for activity watching.
//
// Returns the read-only descriptor, or -1 after printing a message on failure.
// No descriptor opened here is left open when -1 is returned.
static int setupOutputFile( const char* outputFileName )
{
    const int outputFd = open( outputFileName, O_APPEND | O_CREAT | O_TRUNC | O_WRONLY, 0644 );
    if( outputFd < 0 )
    {
        fprintf( stderr, NAME "Opening output file %s for writing failed: %s\n", outputFileName,
            strerror( errno ));
        return -1;
    }
    // If stdout or stderr was closed on entry, open() may have returned that
    // very descriptor; it must then stay open after the redirection.
    const bool isStdStream = outputFd == STDOUT_FILENO || outputFd == STDERR_FILENO;
    if( dup2( outputFd, STDOUT_FILENO ) < 0 || dup2( outputFd, STDERR_FILENO ) < 0 )
    {
        perror( NAME "dup2()" );
        if( !isStdStream )
            close( outputFd );
        return -1;
    }
    if( !isStdStream )
        close( outputFd );
    const int watchFd = open( outputFileName, O_RDONLY );
    if( watchFd < 0 )
    {
        perror( NAME "open()" );
        return -1;
    }
    if( !makeNonBlocking( watchFd ))
    {
        close( watchFd );
        return -1;
    }
    return watchFd;
}
int main( int argc, char** argv )
{
if( argc < 4 )
return usage( argv[ 0 ] );
const char* outputFileName = argv[ 1 ];
int timeout = atoi( argv[ 2 ] );
int attempts = atoi( argv[ 3 ] );
int watchFd = setupOutputFile( outputFileName );
if( watchFd < 0 )
return FAILURE;
int exitcode = 0;
for( int attempt = 1;
attempt <= attempts;
++attempt )
{
KillStatus killed = SuccessfullExit;
exitcode = runMake( argc - 4, argv + 4, &killed, timeout, watchFd );
switch( killed )
{
case SuccessfullExit:
break;
case KilledInterrupted:
if( attempt == attempts )
fprintf( stderr, NAME "Error: Make command timed out, maximum number of
attempts reached,"
" failing, exit code %d.\n", exitcode );
else
fprintf( stderr, NAME "Error: Make command timed out, attempt %d/%d,
interrupting"
" and retrying.\n", attempt, attempts );
break;
case KilledForced:
fprintf( stderr, NAME "Error: Make command timed out, force killed, failing,"
" exit code %d\n", exitcode );
attempt = attempts + 1; // break out of the loop
break;
}
}
return exitcode;
}
Attachment:
windows.sh
Description: application/shellscript