From 65a7370db1e967025e45225a97d0b4585c341480 Mon Sep 17 00:00:00 2001 From: Jonathan Coates Date: Wed, 4 May 2022 12:30:12 +0100 Subject: [PATCH] Rethink how computer timeouts are handled Previously we would compute the current timeout flags every 128 instructions of the Lua machine. While computing these flags is _relatively_ cheap (just get the current time), it still all adds up. Instead we now set the timeout flags from the computer monitor/watchdog thread. This does mean that the monitor thread needs to wake up more often[^1] _if the queue is full_, otherwise we can sleep for 100ms as before. This does mean that pausing is a little less accurate (can technically take up 2*period instead). This isn't great, but in practice it seems fine - I've not noticed any playability issues. This offers a small (but measurable!) boost to computer performance. [^1]: We currently sleep for scaledPeriod, but we could choose to do less. --- .../core/computer/ComputerThread.java | 219 +++++++++++------- .../core/computer/TimeoutState.java | 6 +- .../core/lua/CobaltLuaMachine.java | 39 ++-- .../core/computer/ComputerThreadTest.java | 17 +- 4 files changed, 159 insertions(+), 122 deletions(-) diff --git a/src/main/java/dan200/computercraft/core/computer/ComputerThread.java b/src/main/java/dan200/computercraft/core/computer/ComputerThread.java index 028b7b056..d557fabe5 100644 --- a/src/main/java/dan200/computercraft/core/computer/ComputerThread.java +++ b/src/main/java/dan200/computercraft/core/computer/ComputerThread.java @@ -13,6 +13,7 @@ import javax.annotation.Nullable; import java.util.TreeSet; import java.util.concurrent.ThreadFactory; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.locks.Condition; import java.util.concurrent.locks.LockSupport; @@ -49,11 +50,11 @@ import static dan200.computercraft.core.computer.TimeoutState.TIMEOUT; public final class ComputerThread { /** - * How often the computer thread monitor should run, in milliseconds. + * How often the computer thread monitor should run. * * @see Monitor */ - private static final int MONITOR_WAKEUP = 100; + private static final long MONITOR_WAKEUP = TimeUnit.MILLISECONDS.toNanos( 100 ); /** * The target latency between executing two tasks on a single machine. @@ -76,6 +77,13 @@ public final class ComputerThread */ private static final long LATENCY_MAX_TASKS = DEFAULT_LATENCY / DEFAULT_MIN_PERIOD; + /** + * Time difference between reporting crashed threads. + * + * @see TaskRunner#reportTimeout(ComputerExecutor, long) + */ + private static final long REPORT_DEBOUNCE = TimeUnit.SECONDS.toNanos( 1 ); + /** * Lock used for modifications to the array of current threads. */ @@ -102,6 +110,8 @@ public final class ComputerThread private static final ReentrantLock computerLock = new ReentrantLock(); private static final Condition hasWork = computerLock.newCondition(); + private static final AtomicInteger idleWorkers = new AtomicInteger( 0 ); + private static final Condition monitorWakeup = computerLock.newCondition(); /** * Active queues to execute. @@ -135,7 +145,7 @@ public final class ComputerThread if( runners == null ) { - // TODO: Change the runners length on config reloads + // TODO: Update this on config reloads. Or possibly on world restarts? runners = new TaskRunner[ComputerCraft.computerThreads]; // latency and minPeriod are scaled by 1 + floor(log2(threads)). We can afford to execute tasks for @@ -227,9 +237,14 @@ public final class ComputerThread executor.virtualRuntime = Math.max( newRuntime, executor.virtualRuntime ); + boolean wasBusy = isBusy(); // Add to the queue, and signal the workers. computerQueue.add( executor ); hasWork.signal(); + + // If we've transitioned into a busy state, notify the monitor. This will cause it to sleep for scaledPeriod + // instead of the longer wakeup duration. + if( !wasBusy && isBusy() ) monitorWakeup.signal(); } finally { @@ -346,6 +361,17 @@ public final class ComputerThread return !computerQueue.isEmpty(); } + /** + * Check if we have more work queued than we have capacity for. Effectively a more fine-grained version of + * {@link #hasPendingWork()}. + * + * @return If the computer threads are busy. + */ + private static boolean isBusy() + { + return computerQueue.size() > idleWorkers.get(); + } + /** * Observes all currently active {@link TaskRunner}s and terminates their tasks once they have exceeded the hard * abort limit. @@ -357,76 +383,93 @@ public final class ComputerThread @Override public void run() { - try + while( true ) { - while( true ) + computerLock.lock(); + try { - Thread.sleep( MONITOR_WAKEUP ); + // If we've got more work than we have capacity for it, then we'll need to pause a task soon, so + // sleep for a single pause duration. Otherwise we only need to wake up to set the soft/hard abort + // flags, which are far less granular. + monitorWakeup.awaitNanos( isBusy() ? scaledPeriod() : MONITOR_WAKEUP ); + } + catch( InterruptedException e ) + { + ComputerCraft.log.error( "Monitor thread interrupted. Computers may behave very badly!", e ); + break; + } + finally + { + computerLock.unlock(); + } - TaskRunner[] currentRunners = ComputerThread.runners; - if( currentRunners != null ) + checkRunners(); + } + } + + private static void checkRunners() + { + TaskRunner[] currentRunners = ComputerThread.runners; + if( currentRunners == null ) return; + + for( int i = 0; i < currentRunners.length; i++ ) + { + TaskRunner runner = currentRunners[i]; + // If we've no runner, skip. + if( runner == null || runner.owner == null || !runner.owner.isAlive() ) + { + if( !running ) continue; + + // Mark the old runner as dead and start a new one. + ComputerCraft.log.warn( "Previous runner ({}) has crashed, restarting!", + runner != null && runner.owner != null ? runner.owner.getName() : runner ); + if( runner != null ) runner.running = false; + runnerFactory.newThread( runners[i] = new TaskRunner() ).start(); + } + + // If the runner has no work, skip + ComputerExecutor executor = runner.currentExecutor.get(); + if( executor == null ) continue; + + // Refresh the timeout state. Will set the pause/soft timeout flags as appropriate. + executor.timeout.refresh(); + + // If we're still within normal execution times (TIMEOUT) or soft abort (ABORT_TIMEOUT), + // then we can let the Lua machine do its work. + long afterStart = executor.timeout.nanoCumulative(); + long afterHardAbort = afterStart - TIMEOUT - ABORT_TIMEOUT; + if( afterHardAbort < 0 ) continue; + + // Set the hard abort flag. + executor.timeout.hardAbort(); + executor.abort(); + + if( afterHardAbort >= ABORT_TIMEOUT * 2 ) + { + // If we've hard aborted and interrupted, and we're still not dead, then mark the runner + // as dead, finish off the task, and spawn a new runner. + runner.reportTimeout( executor, afterStart ); + runner.running = false; + runner.owner.interrupt(); + + ComputerExecutor thisExecutor = runner.currentExecutor.getAndSet( null ); + if( thisExecutor != null ) afterWork( runner, executor ); + + synchronized( threadLock ) { - for( int i = 0; i < currentRunners.length; i++ ) + if( running && runners.length > i && runners[i] == runner ) { - TaskRunner runner = currentRunners[i]; - // If we've no runner, skip. - if( runner == null || runner.owner == null || !runner.owner.isAlive() ) - { - if( !running ) continue; - - // Mark the old runner as dead and start a new one. - ComputerCraft.log.warn( "Previous runner ({}) has crashed, restarting!", - runner != null && runner.owner != null ? runner.owner.getName() : runner ); - if( runner != null ) runner.running = false; - runnerFactory.newThread( runners[i] = new TaskRunner() ).start(); - } - - // If the runner has no work, skip - ComputerExecutor executor = runner.currentExecutor.get(); - if( executor == null ) continue; - - // If we're still within normal execution times (TIMEOUT) or soft abort (ABORT_TIMEOUT), - // then we can let the Lua machine do its work. - long afterStart = executor.timeout.nanoCumulative(); - long afterHardAbort = afterStart - TIMEOUT - ABORT_TIMEOUT; - if( afterHardAbort < 0 ) continue; - - // Set the hard abort flag. - executor.timeout.hardAbort(); - executor.abort(); - - if( afterHardAbort >= ABORT_TIMEOUT * 2 ) - { - // If we've hard aborted and interrupted, and we're still not dead, then mark the runner - // as dead, finish off the task, and spawn a new runner. - timeoutTask( executor, runner.owner, afterStart ); - runner.running = false; - runner.owner.interrupt(); - - ComputerExecutor thisExecutor = runner.currentExecutor.getAndSet( null ); - if( thisExecutor != null ) afterWork( runner, executor ); - - synchronized( threadLock ) - { - if( running && runners.length > i && runners[i] == runner ) - { - runnerFactory.newThread( currentRunners[i] = new TaskRunner() ).start(); - } - } - } - else if( afterHardAbort >= ABORT_TIMEOUT ) - { - // If we've hard aborted but we're still not dead, dump the stack trace and interrupt - // the task. - timeoutTask( executor, runner.owner, afterStart ); - runner.owner.interrupt(); - } + runnerFactory.newThread( currentRunners[i] = new TaskRunner() ).start(); } } } - } - catch( InterruptedException ignored ) - { + else if( afterHardAbort >= ABORT_TIMEOUT ) + { + // If we've hard aborted but we're still not dead, dump the stack trace and interrupt + // the task. + runner.reportTimeout( executor, afterStart ); + runner.owner.interrupt(); + } } } } @@ -441,6 +484,7 @@ public final class ComputerThread private static final class TaskRunner implements Runnable { Thread owner; + long lastReport = Long.MIN_VALUE; volatile boolean running = true; final AtomicReference currentExecutor = new AtomicReference<>(); @@ -460,6 +504,7 @@ public final class ComputerThread computerLock.lockInterruptibly(); try { + idleWorkers.incrementAndGet(); while( computerQueue.isEmpty() ) hasWork.await(); executor = computerQueue.pollFirst(); assert executor != null : "hasWork should ensure we never receive null work"; @@ -467,6 +512,7 @@ public final class ComputerThread finally { computerLock.unlock(); + idleWorkers.decrementAndGet(); } } catch( InterruptedException ignored ) @@ -516,27 +562,32 @@ public final class ComputerThread } } } - } - private static void timeoutTask( ComputerExecutor executor, Thread thread, long time ) - { - if( !ComputerCraft.logComputerErrors ) return; - - StringBuilder builder = new StringBuilder() - .append( "Terminating computer #" ).append( executor.getComputer().getID() ) - .append( " due to timeout (running for " ).append( time * 1e-9 ) - .append( " seconds). This is NOT a bug, but may mean a computer is misbehaving. " ) - .append( thread.getName() ) - .append( " is currently " ) - .append( thread.getState() ); - Object blocking = LockSupport.getBlocker( thread ); - if( blocking != null ) builder.append( "\n on " ).append( blocking ); - - for( StackTraceElement element : thread.getStackTrace() ) + private void reportTimeout( ComputerExecutor executor, long time ) { - builder.append( "\n at " ).append( element ); - } + if( !ComputerCraft.logComputerErrors ) return; - ComputerCraft.log.warn( builder.toString() ); + // Attempt to debounce stack trace reporting, limiting ourselves to one every second. + long now = System.nanoTime(); + if( lastReport != Long.MIN_VALUE && now - lastReport - REPORT_DEBOUNCE <= 0 ) return; + lastReport = now; + + StringBuilder builder = new StringBuilder() + .append( "Terminating computer #" ).append( executor.getComputer().getID() ) + .append( " due to timeout (running for " ).append( time * 1e-9 ) + .append( " seconds). This is NOT a bug, but may mean a computer is misbehaving. " ) + .append( owner.getName() ) + .append( " is currently " ) + .append( owner.getState() ); + Object blocking = LockSupport.getBlocker( owner ); + if( blocking != null ) builder.append( "\n on " ).append( blocking ); + + for( StackTraceElement element : owner.getStackTrace() ) + { + builder.append( "\n at " ).append( element ); + } + + ComputerCraft.log.warn( builder.toString() ); + } } } diff --git a/src/main/java/dan200/computercraft/core/computer/TimeoutState.java b/src/main/java/dan200/computercraft/core/computer/TimeoutState.java index 875bf6788..80be47199 100644 --- a/src/main/java/dan200/computercraft/core/computer/TimeoutState.java +++ b/src/main/java/dan200/computercraft/core/computer/TimeoutState.java @@ -86,7 +86,7 @@ public final class TimeoutState /** * Recompute the {@link #isSoftAborted()} and {@link #isPaused()} flags. */ - public void refresh() + public synchronized void refresh() { // Important: The weird arithmetic here is important, as nanoTime may return negative values, and so we // need to handle overflow. @@ -153,7 +153,7 @@ public final class TimeoutState * * @see #nanoCumulative() */ - void pauseTimer() + synchronized void pauseTimer() { // We set the cumulative time to difference between current time and "nominal start time". cumulativeElapsed = System.nanoTime() - cumulativeStart; @@ -163,7 +163,7 @@ public final class TimeoutState /** * Resets the cumulative time and resets the abort flags. */ - void stopTimer() + synchronized void stopTimer() { cumulativeElapsed = 0; paused = softAbort = hardAbort = false; diff --git a/src/main/java/dan200/computercraft/core/lua/CobaltLuaMachine.java b/src/main/java/dan200/computercraft/core/lua/CobaltLuaMachine.java index 163c322d1..555466a20 100644 --- a/src/main/java/dan200/computercraft/core/lua/CobaltLuaMachine.java +++ b/src/main/java/dan200/computercraft/core/lua/CobaltLuaMachine.java @@ -452,24 +452,9 @@ public class CobaltLuaMachine implements ILuaMachine // We check our current pause/abort state every 128 instructions. if( (count = (count + 1) & 127) == 0 ) { - // If we've been hard aborted or closed then abort. if( timeout.isHardAborted() || state == null ) throw HardAbortError.INSTANCE; - - timeout.refresh(); - if( timeout.isPaused() ) - { - // Preserve the current state - isPaused = true; - oldInHook = ds.inhook; - oldFlags = di.flags; - - // Suspend the state. This will probably throw, but we need to handle the case where it won't. - di.flags |= FLAG_HOOKYIELD | FLAG_HOOKED; - LuaThread.suspend( ds.getLuaState() ); - resetPaused( ds, di ); - } - - handleSoftAbort(); + if( timeout.isPaused() ) handlePause( ds, di ); + if( timeout.isSoftAborted() ) handleSoftAbort(); } super.onInstruction( ds, di, pc ); @@ -478,13 +463,10 @@ public class CobaltLuaMachine implements ILuaMachine @Override public void poll() throws LuaError { - // If we've been hard aborted or closed then abort. LuaState state = CobaltLuaMachine.this.state; if( timeout.isHardAborted() || state == null ) throw HardAbortError.INSTANCE; - - timeout.refresh(); if( timeout.isPaused() ) LuaThread.suspendBlocking( state ); - handleSoftAbort(); + if( timeout.isSoftAborted() ) handleSoftAbort(); } private void resetPaused( DebugState ds, DebugFrame di ) @@ -498,11 +480,24 @@ public class CobaltLuaMachine implements ILuaMachine private void handleSoftAbort() throws LuaError { // If we already thrown our soft abort error then don't do it again. - if( !timeout.isSoftAborted() || thrownSoftAbort ) return; + if( thrownSoftAbort ) return; thrownSoftAbort = true; throw new LuaError( TimeoutState.ABORT_MESSAGE ); } + + private void handlePause( DebugState ds, DebugFrame di ) throws LuaError, UnwindThrowable + { + // Preserve the current state + isPaused = true; + oldInHook = ds.inhook; + oldFlags = di.flags; + + // Suspend the state. This will probably throw, but we need to handle the case where it won't. + di.flags |= FLAG_HOOKYIELD | FLAG_HOOKED; + LuaThread.suspend( ds.getLuaState() ); + resetPaused( ds, di ); + } } private static final class HardAbortError extends Error diff --git a/src/test/java/dan200/computercraft/core/computer/ComputerThreadTest.java b/src/test/java/dan200/computercraft/core/computer/ComputerThreadTest.java index 7901de7fa..70fcc3422 100644 --- a/src/test/java/dan200/computercraft/core/computer/ComputerThreadTest.java +++ b/src/test/java/dan200/computercraft/core/computer/ComputerThreadTest.java @@ -33,10 +33,7 @@ public class ComputerThreadTest FakeComputerManager.enqueue( computer, timeout -> { assertFalse( timeout.isSoftAborted(), "Should not start soft-aborted" ); - long delay = ConcurrentHelpers.waitUntil( () -> { - timeout.refresh(); - return timeout.isSoftAborted(); - } ); + long delay = ConcurrentHelpers.waitUntil( timeout::isSoftAborted ); assertThat( "Should be soft aborted", delay * 1e-9, closeTo( 7, 0.5 ) ); ComputerCraft.log.info( "Slept for {}", delay ); @@ -69,10 +66,7 @@ public class ComputerThreadTest { Computer computer = FakeComputerManager.create(); FakeComputerManager.enqueue( computer, timeout -> { - boolean didPause = ConcurrentHelpers.waitUntil( () -> { - timeout.refresh(); - return timeout.isPaused(); - }, 5, TimeUnit.SECONDS ); + boolean didPause = ConcurrentHelpers.waitUntil( timeout::isPaused, 5, TimeUnit.SECONDS ); assertFalse( didPause, "Machine shouldn't have paused within 5s" ); computer.shutdown(); @@ -90,11 +84,8 @@ public class ComputerThreadTest long budget = ComputerThread.scaledPeriod(); assertEquals( budget, TimeUnit.MILLISECONDS.toNanos( 25 ), "Budget should be 25ms" ); - long delay = ConcurrentHelpers.waitUntil( () -> { - timeout.refresh(); - return timeout.isPaused(); - } ); - assertThat( "Paused within 25ms", delay * 1e-9, closeTo( 0.025, 0.01 ) ); + long delay = ConcurrentHelpers.waitUntil( timeout::isPaused ); + assertThat( "Paused within 25ms", delay * 1e-9, closeTo( 0.03, 0.015 ) ); computer.shutdown(); return MachineResult.OK;