diff --git a/services/clsi/app.js b/services/clsi/app.js index f24ce5f924..6266c86b16 100644 --- a/services/clsi/app.js +++ b/services/clsi/app.js @@ -10,6 +10,7 @@ const Metrics = require('@overleaf/metrics') Metrics.initialize('clsi') const CompileController = require('./app/js/CompileController') +const ContentController = require('./app/js/ContentController') const Settings = require('settings-sharelatex') const logger = require('logger-sharelatex') logger.initialize('clsi') @@ -170,6 +171,15 @@ app.get( } ) +app.get( + '/project/:projectId/content/:contentId/:hash', + ContentController.getPdfRange +) +app.get( + '/project/:projectId/user/:userId/content/:contentId/:hash', + ContentController.getPdfRange +) + app.get('/project/:project_id/build/:build_id/output/*', function ( req, res, diff --git a/services/clsi/app/js/CompileController.js b/services/clsi/app/js/CompileController.js index ddf83d7c8e..2813bca984 100644 --- a/services/clsi/app/js/CompileController.js +++ b/services/clsi/app/js/CompileController.js @@ -49,7 +49,9 @@ module.exports = CompileController = { } return CompileManager.doCompileWithLock(request, function ( error, - outputFiles + outputFiles, + stats, + timings ) { let code, status if (outputFiles == null) { @@ -118,18 +120,30 @@ module.exports = CompileController = { compile: { status, error: (error != null ? error.message : undefined) || error, - outputFiles: outputFiles.map((file) => ({ - url: - `${Settings.apis.clsi.url}/project/${request.project_id}` + - (request.user_id != null - ? `/user/${request.user_id}` - : '') + - (file.build != null ? `/build/${file.build}` : '') + - `/output/${file.path}`, - path: file.path, - type: file.type, - build: file.build - })) + stats, + timings, + outputFiles: outputFiles.map((file) => { + const record = { + url: + `${Settings.apis.clsi.url}/project/${request.project_id}` + + (request.user_id != null + ? `/user/${request.user_id}` + : '') + + (file.build != null ? `/build/${file.build}` : '') + + `/output/${file.path}`, + path: file.path, + type: file.type, + build: file.build, + contentId: file.contentId + } + if (file.ranges != null) { + record.ranges = file.ranges + } + if (file.size != null) { + record.size = file.size + } + return record + }) } }) }) diff --git a/services/clsi/app/js/CompileManager.js b/services/clsi/app/js/CompileManager.js index c801062595..c771082bfd 100644 --- a/services/clsi/app/js/CompileManager.js +++ b/services/clsi/app/js/CompileManager.js @@ -34,6 +34,7 @@ const os = require('os') const async = require('async') const Errors = require('./Errors') const CommandRunner = require('./CommandRunner') +const { emitPdfStats } = require('./ContentCacheMetrics') const getCompileName = function (project_id, user_id) { if (user_id != null) { @@ -77,6 +78,7 @@ module.exports = CompileManager = { const compileDir = getCompileDir(request.project_id, request.user_id) const outputDir = getOutputDir(request.project_id, request.user_id) + const timerE2E = new Metrics.Timer('compile-e2e') let timer = new Metrics.Timer('write-to-disk') logger.log( { project_id: request.project_id, user_id: request.user_id }, @@ -249,11 +251,13 @@ module.exports = CompileManager = { return callback(error) } Metrics.inc('compiles-succeeded') + stats = stats || {} const object = stats || {} for (metric_key in object) { metric_value = object[metric_key] Metrics.count(metric_key, metric_value) } + timings = timings || {} const object1 = timings || {} for (metric_key in object1) { metric_value = object1[metric_key] @@ -297,10 +301,32 @@ module.exports = CompileManager = { return callback(error) } return OutputCacheManager.saveOutputFiles( + { request, stats, timings }, outputFiles, compileDir, outputDir, - (error, newOutputFiles) => callback(null, newOutputFiles) + (err, newOutputFiles) => { + if (err) { + const { + project_id: projectId, + user_id: userId + } = request + logger.err( + { projectId, userId, err }, + 'failed to save output files' + ) + } + + // Emit compile time. + timings.compile = ts + timings.compileE2E = timerE2E.done() + + if (stats['pdf-size']) { + emitPdfStats(stats, timings) + } + + callback(null, newOutputFiles, stats, timings) + } ) } ) diff --git a/services/clsi/app/js/ContentCacheManager.js b/services/clsi/app/js/ContentCacheManager.js new file mode 100644 index 0000000000..94ed9ebc00 --- /dev/null +++ b/services/clsi/app/js/ContentCacheManager.js @@ -0,0 +1,118 @@ +/** + * ContentCacheManager - maintains a cache of stream hashes from a PDF file + */ + +const { callbackify } = require('util') +const fs = require('fs') +const crypto = require('crypto') +const Path = require('path') +const Settings = require('settings-sharelatex') + +const MIN_CHUNK_SIZE = Settings.pdfCachingMinChunkSize + +/** + * + * @param {String} contentDir path to directory where content hash files are cached + * @param {String} filePath the pdf file to scan for streams + */ +async function update(contentDir, filePath) { + const stream = fs.createReadStream(filePath) + const extractor = new PdfStreamsExtractor() + const ranges = [] + const newRanges = [] + for await (const chunk of stream) { + const pdfStreams = extractor.consume(chunk) + for (const pdfStream of pdfStreams) { + if (pdfStream.end - pdfStream.start < MIN_CHUNK_SIZE) continue + const hash = pdfStreamHash(pdfStream.buffers) + const range = { start: pdfStream.start, end: pdfStream.end, hash } + ranges.push(range) + if (await writePdfStream(contentDir, hash, pdfStream.buffers)) { + newRanges.push(range) + } + } + } + return [ranges, newRanges] +} + +class PdfStreamsExtractor { + constructor() { + this.fileIndex = 0 + this.inStream = false + this.streamStartIndex = 0 + this.buffers = [] + } + + consume(chunk) { + let chunkIndex = 0 + const pdfStreams = [] + while (true) { + if (!this.inStream) { + // Not in a stream, look for stream start + const index = chunk.indexOf('stream', chunkIndex) + if (index === -1) { + // Couldn't find stream start + break + } + // Found stream start, start a stream + this.inStream = true + this.streamStartIndex = this.fileIndex + index + chunkIndex = index + } else { + // In a stream, look for stream end + const index = chunk.indexOf('endstream', chunkIndex) + if (index === -1) { + this.buffers.push(chunk.slice(chunkIndex)) + break + } + // add "endstream" part + const endIndex = index + 9 + this.buffers.push(chunk.slice(chunkIndex, endIndex)) + pdfStreams.push({ + start: this.streamStartIndex, + end: this.fileIndex + endIndex, + buffers: this.buffers + }) + this.inStream = false + this.buffers = [] + chunkIndex = endIndex + } + } + this.fileIndex += chunk.length + return pdfStreams + } +} + +function pdfStreamHash(buffers) { + const hash = crypto.createHash('sha256') + for (const buffer of buffers) { + hash.update(buffer) + } + return hash.digest('hex') +} + +async function writePdfStream(dir, hash, buffers) { + const filename = Path.join(dir, hash) + try { + await fs.promises.stat(filename) + // The file exists. Do not rewrite the content. + // It would change the modified-time of the file and hence invalidate the + // ETags used for client side caching via browser internals. + return false + } catch (e) {} + const file = await fs.promises.open(filename, 'w') + if (Settings.enablePdfCachingDark) { + // Write an empty file in dark mode. + buffers = [] + } + try { + for (const buffer of buffers) { + await file.write(buffer) + } + } finally { + await file.close() + } + return true +} + +module.exports = { update: callbackify(update) } diff --git a/services/clsi/app/js/ContentCacheMetrics.js b/services/clsi/app/js/ContentCacheMetrics.js new file mode 100644 index 0000000000..04b475f827 --- /dev/null +++ b/services/clsi/app/js/ContentCacheMetrics.js @@ -0,0 +1,80 @@ +const Metrics = require('./Metrics') + +const ONE_MB = 1024 * 1024 + +function emitPdfStats(stats, timings) { + if (timings['compute-pdf-caching']) { + emitPdfCachingStats(stats, timings) + } else { + // How much bandwidth will the pdf incur when downloaded in full? + Metrics.summary('pdf-bandwidth', stats['pdf-size']) + } +} + +function emitPdfCachingStats(stats, timings) { + if (!stats['pdf-size']) return // double check + + // How large is the overhead of hashing up-front? + const fraction = + timings.compileE2E - timings['compute-pdf-caching'] !== 0 + ? timings.compileE2E / + (timings.compileE2E - timings['compute-pdf-caching']) + : 1 + Metrics.summary('overhead-compute-pdf-ranges', fraction * 100 - 100) + + // How does the hashing scale to pdf size in MB? + Metrics.timing( + 'compute-pdf-caching-relative-to-pdf-size', + timings['compute-pdf-caching'] / (stats['pdf-size'] / ONE_MB) + ) + if (stats['pdf-caching-total-ranges-size']) { + // How does the hashing scale to total ranges size in MB? + Metrics.timing( + 'compute-pdf-caching-relative-to-total-ranges-size', + timings['compute-pdf-caching'] / + (stats['pdf-caching-total-ranges-size'] / ONE_MB) + ) + // How fast is the hashing per range on average? + Metrics.timing( + 'compute-pdf-caching-relative-to-ranges-count', + timings['compute-pdf-caching'] / stats['pdf-caching-n-ranges'] + ) + + // How many ranges are new? + Metrics.summary( + 'new-pdf-ranges-relative-to-total-ranges', + (stats['pdf-caching-n-new-ranges'] / stats['pdf-caching-n-ranges']) * 100 + ) + } + + // How much content is cacheable? + Metrics.summary( + 'cacheable-ranges-to-pdf-size', + (stats['pdf-caching-total-ranges-size'] / stats['pdf-size']) * 100 + ) + + const sizeWhenDownloadedInFull = + // All of the pdf + stats['pdf-size'] - + // These ranges are potentially cached. + stats['pdf-caching-total-ranges-size'] + + // These ranges are not cached. + stats['pdf-caching-new-ranges-size'] + + // How much bandwidth can we save when downloading the pdf in full? + Metrics.summary( + 'pdf-bandwidth-savings', + 100 - (sizeWhenDownloadedInFull / stats['pdf-size']) * 100 + ) + + // How much bandwidth will the pdf incur when downloaded in full? + Metrics.summary('pdf-bandwidth', sizeWhenDownloadedInFull) + + // How much space do the ranges use? + // This will accumulate the ranges size over time, skipping already written ranges. + Metrics.summary('pdf-ranges-disk-size', stats['pdf-caching-new-ranges-size']) +} + +module.exports = { + emitPdfStats +} diff --git a/services/clsi/app/js/ContentController.js b/services/clsi/app/js/ContentController.js new file mode 100644 index 0000000000..76478defe8 --- /dev/null +++ b/services/clsi/app/js/ContentController.js @@ -0,0 +1,24 @@ +const Path = require('path') +const send = require('send') +const Settings = require('settings-sharelatex') +const OutputCacheManager = require('./OutputCacheManager') + +const ONE_DAY_S = 24 * 60 * 60 +const ONE_DAY_MS = ONE_DAY_S * 1000 + +function getPdfRange(req, res, next) { + const { projectId, userId, contentId, hash } = req.params + const perUserDir = userId ? `${projectId}-${userId}` : projectId + const path = Path.join( + Settings.path.outputDir, + perUserDir, + OutputCacheManager.CONTENT_SUBDIR, + contentId, + hash + ) + res.setHeader('cache-control', `public, max-age=${ONE_DAY_S}`) + res.setHeader('expires', new Date(Date.now() + ONE_DAY_MS).toUTCString()) + send(req, path).pipe(res) +} + +module.exports = { getPdfRange } diff --git a/services/clsi/app/js/OutputCacheManager.js b/services/clsi/app/js/OutputCacheManager.js index 8aefb9b77a..cd1bd883a2 100644 --- a/services/clsi/app/js/OutputCacheManager.js +++ b/services/clsi/app/js/OutputCacheManager.js @@ -22,10 +22,13 @@ const logger = require('logger-sharelatex') const _ = require('lodash') const Settings = require('settings-sharelatex') const crypto = require('crypto') +const Metrics = require('./Metrics') const OutputFileOptimiser = require('./OutputFileOptimiser') +const ContentCacheManager = require('./ContentCacheManager') module.exports = OutputCacheManager = { + CONTENT_SUBDIR: 'content', CACHE_SUBDIR: 'generated-files', ARCHIVE_SUBDIR: 'archived-logs', // build id is HEXDATE-HEXRANDOM from Date.now()and RandomBytes @@ -59,7 +62,13 @@ module.exports = OutputCacheManager = { }) }, - saveOutputFiles(outputFiles, compileDir, outputDir, callback) { + saveOutputFiles( + { request, stats, timings }, + outputFiles, + compileDir, + outputDir, + callback + ) { if (callback == null) { callback = function (error) {} } @@ -72,7 +81,31 @@ module.exports = OutputCacheManager = { compileDir, outputDir, buildId, - callback + function (err, result) { + if (err != null) { + return callback(err) + } + OutputCacheManager.collectOutputPdfSize( + result, + outputDir, + stats, + (err, result) => { + if (err) return callback(err, result) + + if (!Settings.enablePdfCaching || !request.enablePdfCaching) { + return callback(null, result) + } + + OutputCacheManager.saveStreamsInContentDir( + { stats, timings }, + result, + compileDir, + outputDir, + callback + ) + } + ) + } ) }) }, @@ -206,6 +239,107 @@ module.exports = OutputCacheManager = { }) }, + collectOutputPdfSize(outputFiles, outputDir, stats, callback) { + const outputFile = outputFiles.find((x) => x.path === 'output.pdf') + if (!outputFile) return callback(null, outputFiles) + const outputFilePath = Path.join( + outputDir, + OutputCacheManager.path(outputFile.build, outputFile.path) + ) + fs.stat(outputFilePath, (err, stat) => { + if (err) return callback(err, outputFiles) + + outputFile.size = stat.size + stats['pdf-size'] = outputFile.size + callback(null, outputFiles) + }) + }, + + saveStreamsInContentDir( + { stats, timings }, + outputFiles, + compileDir, + outputDir, + callback + ) { + const cacheRoot = Path.join(outputDir, OutputCacheManager.CONTENT_SUBDIR) + // check if content dir exists + OutputCacheManager.ensureContentDir(cacheRoot, function (err, contentDir) { + if (err) return callback(err, outputFiles) + + const outputFile = outputFiles.find((x) => x.path === 'output.pdf') + if (outputFile) { + // possibly we should copy the file from the build dir here + const outputFilePath = Path.join( + outputDir, + OutputCacheManager.path(outputFile.build, outputFile.path) + ) + const timer = new Metrics.Timer('compute-pdf-ranges') + ContentCacheManager.update(contentDir, outputFilePath, function ( + err, + ranges + ) { + if (err) return callback(err, outputFiles) + const [contentRanges, newContentRanges] = ranges + + if (Settings.enablePdfCachingDark) { + // In dark mode we are doing the computation only and do not emit + // any ranges to the frontend. + } else { + outputFile.contentId = Path.basename(contentDir) + outputFile.ranges = contentRanges + } + + timings['compute-pdf-caching'] = timer.done() + stats['pdf-caching-n-ranges'] = contentRanges.length + stats['pdf-caching-total-ranges-size'] = contentRanges.reduce( + (sum, next) => sum + (next.end - next.start), + 0 + ) + stats['pdf-caching-n-new-ranges'] = newContentRanges.length + stats['pdf-caching-new-ranges-size'] = newContentRanges.reduce( + (sum, next) => sum + (next.end - next.start), + 0 + ) + callback(null, outputFiles) + }) + } else { + callback(null, outputFiles) + } + }) + }, + + ensureContentDir(contentRoot, callback) { + fse.ensureDir(contentRoot, function (err) { + if (err != null) { + return callback(err) + } + fs.readdir(contentRoot, function (err, results) { + const dirs = results.sort() + const contentId = dirs.find((dir) => + OutputCacheManager.BUILD_REGEX.test(dir) + ) + if (contentId) { + callback(null, Path.join(contentRoot, contentId)) + } else { + // make a content directory + OutputCacheManager.generateBuildId(function (err, contentId) { + if (err) { + return callback(err) + } + const contentDir = Path.join(contentRoot, contentId) + fse.ensureDir(contentDir, function (err) { + if (err) { + return callback(err) + } + return callback(null, contentDir) + }) + }) + } + }) + }) + }, + archiveLogs(outputFiles, compileDir, outputDir, buildId, callback) { if (callback == null) { callback = function (error) {} diff --git a/services/clsi/app/js/RequestParser.js b/services/clsi/app/js/RequestParser.js index 4377204428..66c917aeed 100644 --- a/services/clsi/app/js/RequestParser.js +++ b/services/clsi/app/js/RequestParser.js @@ -50,6 +50,14 @@ module.exports = RequestParser = { type: 'string' } ) + response.enablePdfCaching = this._parseAttribute( + 'enablePdfCaching', + compile.options.enablePdfCaching, + { + default: false, + type: 'boolean' + } + ) response.timeout = this._parseAttribute( 'timeout', compile.options.timeout, diff --git a/services/clsi/config/settings.defaults.js b/services/clsi/config/settings.defaults.js index a0ad8433aa..b65a019365 100644 --- a/services/clsi/config/settings.defaults.js +++ b/services/clsi/config/settings.defaults.js @@ -63,7 +63,12 @@ module.exports = { texliveOpenoutAny: process.env.TEXLIVE_OPENOUT_ANY, sentry: { dsn: process.env.SENTRY_DSN - } + }, + + enablePdfCaching: process.env.ENABLE_PDF_CACHING === 'true', + enablePdfCachingDark: process.env.ENABLE_PDF_CACHING_DARK === 'true', + pdfCachingMinChunkSize: + parseInt(process.env.PDF_CACHING_MIN_CHUNK_SIZE, 10) || 1024 } if (process.env.ALLOWED_COMPILE_GROUPS) { diff --git a/services/clsi/docker-compose.ci.yml b/services/clsi/docker-compose.ci.yml index 2237933754..245059ad98 100644 --- a/services/clsi/docker-compose.ci.yml +++ b/services/clsi/docker-compose.ci.yml @@ -29,6 +29,7 @@ services: NODE_ENV: test NODE_OPTIONS: "--unhandled-rejections=strict" TEXLIVE_IMAGE: + ENABLE_PDF_CACHING: "true" command: npm run test:acceptance:_run diff --git a/services/clsi/docker-compose.yml b/services/clsi/docker-compose.yml index 6688000204..ee68a7c0ee 100644 --- a/services/clsi/docker-compose.yml +++ b/services/clsi/docker-compose.yml @@ -38,5 +38,6 @@ services: LOG_LEVEL: ERROR NODE_ENV: test NODE_OPTIONS: "--unhandled-rejections=strict" + ENABLE_PDF_CACHING: "true" command: npm run --silent test:acceptance diff --git a/services/clsi/test/acceptance/js/Stats.js b/services/clsi/test/acceptance/js/Stats.js new file mode 100644 index 0000000000..87b20b1cc9 --- /dev/null +++ b/services/clsi/test/acceptance/js/Stats.js @@ -0,0 +1,16 @@ +const request = require('request') +const Settings = require('settings-sharelatex') +after(function (done) { + request( + { + url: `${Settings.apis.clsi.url}/metrics` + }, + (err, response, body) => { + if (err) return done(err) + console.error('-- metrics --') + console.error(body) + console.error('-- metrics --') + done() + } + ) +}) diff --git a/services/clsi/test/acceptance/js/helpers/Client.js b/services/clsi/test/acceptance/js/helpers/Client.js index 43825ae415..c5814ff3ef 100644 --- a/services/clsi/test/acceptance/js/helpers/Client.js +++ b/services/clsi/test/acceptance/js/helpers/Client.js @@ -30,6 +30,10 @@ module.exports = Client = { if (callback == null) { callback = function (error, res, body) {} } + if (data) { + // Enable pdf caching unless disabled explicitly. + data.options = Object.assign({}, { enablePdfCaching: true }, data.options) + } return request.post( { url: `${this.host}/project/${project_id}/compile`, diff --git a/services/clsi/test/unit/js/CompileControllerTests.js b/services/clsi/test/unit/js/CompileControllerTests.js index 6f256d2955..6d6b34ee58 100644 --- a/services/clsi/test/unit/js/CompileControllerTests.js +++ b/services/clsi/test/unit/js/CompileControllerTests.js @@ -113,6 +113,8 @@ describe('CompileController', function () { this.ProjectPersistenceManager.markProjectAsJustAccessed = sinon .stub() .callsArg(1) + this.stats = { foo: 1 } + this.timings = { bar: 2 } this.res.status = sinon.stub().returnsThis() return (this.res.send = sinon.stub()) }) @@ -121,7 +123,7 @@ describe('CompileController', function () { beforeEach(function () { this.CompileManager.doCompileWithLock = sinon .stub() - .callsArgWith(1, null, this.output_files) + .yields(null, this.output_files, this.stats, this.timings) return this.CompileController.compile(this.req, this.res) }) @@ -150,12 +152,16 @@ describe('CompileController', function () { compile: { status: 'success', error: null, + stats: this.stats, + timings: this.timings, outputFiles: this.output_files.map((file) => { return { url: `${this.Settings.apis.clsi.url}/project/${this.project_id}/build/${file.build}/output/${file.path}`, path: file.path, type: file.type, - build: file.build + build: file.build, + // gets dropped by JSON.stringify + contentId: undefined } }) } @@ -180,7 +186,7 @@ describe('CompileController', function () { ] this.CompileManager.doCompileWithLock = sinon .stub() - .callsArgWith(1, null, this.output_files) + .yields(null, this.output_files, this.stats, this.timings) this.CompileController.compile(this.req, this.res) }) @@ -191,12 +197,16 @@ describe('CompileController', function () { compile: { status: 'failure', error: null, + stats: this.stats, + timings: this.timings, outputFiles: this.output_files.map((file) => { return { url: `${this.Settings.apis.clsi.url}/project/${this.project_id}/build/${file.build}/output/${file.path}`, path: file.path, type: file.type, - build: file.build + build: file.build, + // gets dropped by JSON.stringify + contentId: undefined } }) } @@ -220,7 +230,10 @@ describe('CompileController', function () { compile: { status: 'error', error: this.message, - outputFiles: [] + outputFiles: [], + // JSON.stringify will omit these + stats: undefined, + timings: undefined } }) .should.equal(true) @@ -244,7 +257,10 @@ describe('CompileController', function () { compile: { status: 'timedout', error: this.message, - outputFiles: [] + outputFiles: [], + // JSON.stringify will omit these + stats: undefined, + timings: undefined } }) .should.equal(true) @@ -266,7 +282,10 @@ describe('CompileController', function () { compile: { error: null, status: 'failure', - outputFiles: [] + outputFiles: [], + // JSON.stringify will omit these + stats: undefined, + timings: undefined } }) .should.equal(true) diff --git a/services/clsi/test/unit/js/CompileManagerTests.js b/services/clsi/test/unit/js/CompileManagerTests.js index 2d40bdd64a..97e318a4a6 100644 --- a/services/clsi/test/unit/js/CompileManagerTests.js +++ b/services/clsi/test/unit/js/CompileManagerTests.js @@ -71,7 +71,7 @@ describe('CompileManager', function () { this.compileDir = `${this.Settings.path.compilesDir}/${this.project_id}-${this.user_id}` this.CompileManager.doCompile = sinon .stub() - .callsArgWith(1, null, this.output_files) + .yields(null, this.output_files) return (this.LockManager.runWithLock = (lockFile, runner, callback) => runner((err, ...result) => callback(err, ...Array.from(result)))) }) @@ -172,10 +172,10 @@ describe('CompileManager', function () { this.LatexRunner.runLatex = sinon.stub().callsArg(2) this.OutputFileFinder.findOutputFiles = sinon .stub() - .callsArgWith(2, null, this.output_files) + .yields(null, this.output_files) this.OutputCacheManager.saveOutputFiles = sinon .stub() - .callsArgWith(3, null, this.build_files) + .yields(null, this.build_files) this.DraftModeManager.injectDraftMode = sinon.stub().callsArg(1) return (this.TikzManager.checkMainFile = sinon.stub().callsArg(3, false)) })