diff --git a/services/clsi/Dockerfile b/services/clsi/Dockerfile index ce582cfae3..603d6ddf7a 100644 --- a/services/clsi/Dockerfile +++ b/services/clsi/Dockerfile @@ -32,6 +32,7 @@ COPY libraries/overleaf-editor-core/package.json /overleaf/libraries/overleaf-ed COPY libraries/promise-utils/package.json /overleaf/libraries/promise-utils/package.json COPY libraries/settings/package.json /overleaf/libraries/settings/package.json COPY libraries/stream-utils/package.json /overleaf/libraries/stream-utils/package.json +COPY libraries/validation-tools/package.json /overleaf/libraries/validation-tools/package.json COPY services/clsi/package.json /overleaf/services/clsi/package.json COPY .yarn/patches/ /overleaf/.yarn/patches/ @@ -45,6 +46,7 @@ COPY libraries/overleaf-editor-core/ /overleaf/libraries/overleaf-editor-core/ COPY libraries/promise-utils/ /overleaf/libraries/promise-utils/ COPY libraries/settings/ /overleaf/libraries/settings/ COPY libraries/stream-utils/ /overleaf/libraries/stream-utils/ +COPY libraries/validation-tools/ /overleaf/libraries/validation-tools/ COPY services/clsi/ /overleaf/services/clsi/ FROM app AS with-texlive diff --git a/services/clsi/Makefile b/services/clsi/Makefile index 8a70dc8c18..4ab3f5e6ff 100644 --- a/services/clsi/Makefile +++ b/services/clsi/Makefile @@ -25,6 +25,7 @@ IMAGE_CACHE ?= $(IMAGE_REPO):cache-$(shell cat \ $(MONOREPO)/libraries/promise-utils/package.json \ $(MONOREPO)/libraries/settings/package.json \ $(MONOREPO)/libraries/stream-utils/package.json \ + $(MONOREPO)/libraries/validation-tools/package.json \ $(MONOREPO)/services/clsi/package.json \ $(MONOREPO)/.yarn/patches/* \ | sha256sum | cut -d '-' -f1) @@ -169,8 +170,9 @@ test_acceptance_clean: $(DOCKER_COMPOSE_TEST_ACCEPTANCE) down -v -t 0 test_acceptance_pre_run: - docker pull us-east1-docker.pkg.dev/overleaf-ops/ol-docker/pandoc:3.9 - docker pull us-east1-docker.pkg.dev/overleaf-ops/ol-docker/pandoc-staging:3.9 + -docker pull us-east1-docker.pkg.dev/overleaf-ops/ol-docker/pandoc:3.9 + -docker pull us-east1-docker.pkg.dev/overleaf-ops/ol-docker/pandoc-staging:3.9 + -cd ../../ && docker build -t us-east1-docker.pkg.dev/overleaf-ops/ol-docker/pdftocairo:24.02 dockerfiles/pdftocairo ifneq (,$(wildcard test/acceptance/js/scripts/pre-run)) $(DOCKER_COMPOSE_TEST_ACCEPTANCE) run $(DC_RUN_FLAGS) test_acceptance test/acceptance/js/scripts/pre-run endif diff --git a/services/clsi/app.js b/services/clsi/app.js index 7e1fb1b760..231e72dd86 100644 --- a/services/clsi/app.js +++ b/services/clsi/app.js @@ -145,6 +145,11 @@ app.post( bodyParser.json({ limit: Settings.compileSizeLimit }), ConversionController.convertProjectToDocument ) +app.post( + '/convert/pdf-to-jpeg', + FileUploadMiddleware.multerMiddleware, + ConversionController.convertPDFToJPEG +) if (process.env.NODE_ENV === 'development' && global.__coverage__) { app.get('/coverage', (req, res) => { diff --git a/services/clsi/app/js/ConversionController.js b/services/clsi/app/js/ConversionController.js index 745349f0f5..4f79e921c8 100644 --- a/services/clsi/app/js/ConversionController.js +++ b/services/clsi/app/js/ConversionController.js @@ -14,6 +14,7 @@ import RequestParser from './RequestParser.js' import { pipeline } from 'node:stream/promises' import Settings from '@overleaf/settings' import Path from 'node:path' +import { z } from '@overleaf/validation-tools' const CONVERSION_CONFIGS = { docx: { extension: 'docx' }, @@ -77,6 +78,51 @@ async function convertDocumentToLaTeX(req, res) { } } +const PDFToJPEGQuerySchema = z.object({ + mode: z.enum(['preview', 'thumbnail']), +}) + +async function convertPDFToJPEG(req, res) { + const { path } = req.file + if (!Settings.enablePdfConversions) { + await fs.unlink(path).catch(() => {}) + return res.sendStatus(404) + } + const parsed = PDFToJPEGQuerySchema.safeParse(req.query) + if (!parsed.success) { + await fs.unlink(path).catch(() => {}) + return res.sendStatus(400) + } + const { mode } = parsed.data + logger.debug({ path, mode }, 'received pdf for conversion to jpeg') + const conversionId = crypto.randomUUID() + let jpegPath + try { + jpegPath = await ConversionManager.promises.convertPDFToJPEGWithLock( + conversionId, + path, + mode + ) + } finally { + await fs.unlink(path).catch(() => {}) + } + + try { + const jpegStat = await fs.stat(jpegPath) + + res.setHeader('Content-Length', jpegStat.size) + res.attachment('output.jpg') + res.setHeader('X-Content-Type-Options', 'nosniff') + + const readStream = fsSync.createReadStream(jpegPath) + await pipeline(readStream, res) + } finally { + await fs + .rm(Path.dirname(jpegPath), { recursive: true, force: true }) + .catch(() => {}) + } +} + async function convertProjectToDocument(req, res) { if (!Settings.enablePandocConversions) { return res.sendStatus(404) @@ -207,4 +253,5 @@ async function convertProjectToDocument(req, res) { export default { convertDocumentToLaTeX: expressify(convertDocumentToLaTeX), convertProjectToDocument: expressify(convertProjectToDocument), + convertPDFToJPEG: expressify(convertPDFToJPEG), } diff --git a/services/clsi/app/js/ConversionManager.js b/services/clsi/app/js/ConversionManager.js index 6f37a69002..d181744878 100644 --- a/services/clsi/app/js/ConversionManager.js +++ b/services/clsi/app/js/ConversionManager.js @@ -18,6 +18,18 @@ const CONVERSION_CONFIGS = { }, } +const PDF_TO_JPEG_CONFIGS = { + preview: { width: 794, quality: 90 }, + thumbnail: { width: 190, quality: 50 }, +} + +const PDF_TO_JPEG_INPUT_FILENAME = 'input.pdf' +const PDF_TO_JPEG_OUTPUT_FILENAME = 'output.jpg' +const PDF_TO_JPEG_OUTPUT_BASENAME = Path.basename( + PDF_TO_JPEG_OUTPUT_FILENAME, + '.jpg' +) + async function convertToLaTeXWithLock(conversionId, inputPath, conversionType) { const conversionDir = Path.join(Settings.path.compilesDir, conversionId) const lock = LockManager.acquire(conversionDir) @@ -298,9 +310,76 @@ async function convertLaTeXToDocumentInDir( return Path.join(compileDir, finalOutputName) } +async function convertPDFToJPEGWithLock(conversionId, inputPath, mode) { + const conversionDir = Path.join(Settings.path.compilesDir, conversionId) + const lock = LockManager.acquire(conversionDir) + try { + return await convertPDFToJPEG(conversionId, conversionDir, inputPath, mode) + } finally { + lock.release() + } +} + +async function convertPDFToJPEG(conversionId, conversionDir, inputPath, mode) { + const config = PDF_TO_JPEG_CONFIGS[mode] + await fs.mkdir(conversionDir, { recursive: true }) + const newSourcePath = Path.join(conversionDir, PDF_TO_JPEG_INPUT_FILENAME) + await fs.copyFile(inputPath, newSourcePath) + const dstPath = Path.join(conversionDir, PDF_TO_JPEG_OUTPUT_FILENAME) + + try { + const { stdout, stderr, exitCode } = await CommandRunner.promises.run( + conversionId, + [ + 'pdftocairo', + '-jpeg', + '-jpegopt', + `quality=${config.quality}`, + '-singlefile', + '-scale-to-x', + config.width.toString(), + '-scale-to-y', + '-1', // maintain aspect ratio + PDF_TO_JPEG_INPUT_FILENAME, + PDF_TO_JPEG_OUTPUT_BASENAME, + ], + conversionDir, + Settings.pdftocairoImage, + Settings.conversionTimeoutSeconds * 1000, + {}, + 'conversions', + null + ) + if (exitCode !== 0) { + throw new OError('Non-zero exit code from pdftocairo', { + exitCode, + stderr, + }) + } + logger.debug( + { stdout, stderr, exitCode }, + 'pdf-to-jpeg conversion completed' + ) + + const stat = await fs.lstat(dstPath) + if (!stat.isFile()) { + throw new OError('output.jpg is not a regular file', { stat }) + } + + // Clean up the source PDF to leave only the conversion result + await fs.unlink(newSourcePath).catch(() => {}) + } catch (error) { + await fs.rm(conversionDir, { force: true, recursive: true }).catch(() => {}) + throw new OError('pdf-to-jpeg conversion failed').withCause(error) + } + + return dstPath +} + export default { promises: { convertToLaTeXWithLock, convertLaTeXToDocumentInDirWithLock, + convertPDFToJPEGWithLock, }, } diff --git a/services/clsi/buildscript.txt b/services/clsi/buildscript.txt index 57254d5711..c62ead4875 100644 --- a/services/clsi/buildscript.txt +++ b/services/clsi/buildscript.txt @@ -1,7 +1,7 @@ clsi --data-dirs=cache,compiles,output --dependencies= ---env-add=DOWNLOAD_HOST=http://clsi-nginx:8080,ALLOWED_COMPILE_GROUPS=clsi-perf simple-latex-file,ENABLE_PDF_CACHING=true,PDF_CACHING_ENABLE_WORKER_POOL=true,ALLOWED_IMAGES=quay.io/sharelatex/texlive-full:2017.1 quay.io/sharelatex/texlive-full:2025.1 quay.io/sharelatex/pandoc:3.9,TEXLIVE_IMAGE=quay.io/sharelatex/texlive-full:2025.1,TEX_LIVE_IMAGE_NAME_OVERRIDE=us-east1-docker.pkg.dev/overleaf-ops/ol-docker,TEXLIVE_IMAGE_USER=tex,SANDBOXED_COMPILES=true,SANDBOXED_COMPILES_HOST_DIR_COMPILES=$PWD/compiles,SANDBOXED_COMPILES_HOST_DIR_OUTPUT=$PWD/output,ENABLE_PANDOC_CONVERSIONS=true +--env-add=DOWNLOAD_HOST=http://clsi-nginx:8080,ALLOWED_COMPILE_GROUPS=clsi-perf simple-latex-file,ENABLE_PDF_CACHING=true,PDF_CACHING_ENABLE_WORKER_POOL=true,ALLOWED_IMAGES=quay.io/sharelatex/texlive-full:2017.1 quay.io/sharelatex/texlive-full:2025.1 quay.io/sharelatex/pandoc:3.9 quay.io/sharelatex/pdftocairo:24.02,TEXLIVE_IMAGE=quay.io/sharelatex/texlive-full:2025.1,TEX_LIVE_IMAGE_NAME_OVERRIDE=us-east1-docker.pkg.dev/overleaf-ops/ol-docker,TEXLIVE_IMAGE_USER=tex,SANDBOXED_COMPILES=true,SANDBOXED_COMPILES_HOST_DIR_COMPILES=$PWD/compiles,SANDBOXED_COMPILES_HOST_DIR_OUTPUT=$PWD/output,ENABLE_PANDOC_CONVERSIONS=true,ENABLE_PDF_CONVERSIONS=true --env-pass-through= --esmock-loader=False --node-version=24.14.1 diff --git a/services/clsi/config/settings.defaults.cjs b/services/clsi/config/settings.defaults.cjs index 074398874d..33d3330853 100644 --- a/services/clsi/config/settings.defaults.cjs +++ b/services/clsi/config/settings.defaults.cjs @@ -31,6 +31,9 @@ module.exports = { parseInt(process.env.CLSI_CONVERSION_TIMEOUT_SECONDS, 10) || 60, pandocImage: process.env.PANDOC_IMAGE || 'quay.io/sharelatex/pandoc:3.9', enablePandocConversions: process.env.ENABLE_PANDOC_CONVERSIONS === 'true', + pdftocairoImage: + process.env.PDFTOCAIRO_IMAGE || 'quay.io/sharelatex/pdftocairo:24.02', + enablePdfConversions: process.env.ENABLE_PDF_CONVERSIONS === 'true', maxUploadSize: 50 * 1024 * 1024, internal: { diff --git a/services/clsi/docker-compose.ci.yml b/services/clsi/docker-compose.ci.yml index 9f206bfdd7..a02654cbd2 100644 --- a/services/clsi/docker-compose.ci.yml +++ b/services/clsi/docker-compose.ci.yml @@ -30,7 +30,7 @@ services: ALLOWED_COMPILE_GROUPS: clsi-perf simple-latex-file ENABLE_PDF_CACHING: true PDF_CACHING_ENABLE_WORKER_POOL: true - ALLOWED_IMAGES: quay.io/sharelatex/texlive-full:2017.1 quay.io/sharelatex/texlive-full:2025.1 quay.io/sharelatex/pandoc:3.9 + ALLOWED_IMAGES: quay.io/sharelatex/texlive-full:2017.1 quay.io/sharelatex/texlive-full:2025.1 quay.io/sharelatex/pandoc:3.9 quay.io/sharelatex/pdftocairo:24.02 TEXLIVE_IMAGE: quay.io/sharelatex/texlive-full:2025.1 TEX_LIVE_IMAGE_NAME_OVERRIDE: us-east1-docker.pkg.dev/overleaf-ops/ol-docker TEXLIVE_IMAGE_USER: tex @@ -38,6 +38,7 @@ services: SANDBOXED_COMPILES_HOST_DIR_COMPILES: $PWD/compiles SANDBOXED_COMPILES_HOST_DIR_OUTPUT: $PWD/output ENABLE_PANDOC_CONVERSIONS: true + ENABLE_PDF_CONVERSIONS: true volumes: - ./reports:/overleaf/services/clsi/reports - ./compiles:/overleaf/services/clsi/compiles diff --git a/services/clsi/docker-compose.yml b/services/clsi/docker-compose.yml index b6590d4f1e..1ea4549b20 100644 --- a/services/clsi/docker-compose.yml +++ b/services/clsi/docker-compose.yml @@ -53,7 +53,7 @@ services: ALLOWED_COMPILE_GROUPS: clsi-perf simple-latex-file ENABLE_PDF_CACHING: true PDF_CACHING_ENABLE_WORKER_POOL: true - ALLOWED_IMAGES: quay.io/sharelatex/texlive-full:2017.1 quay.io/sharelatex/texlive-full:2025.1 quay.io/sharelatex/pandoc:3.9 + ALLOWED_IMAGES: quay.io/sharelatex/texlive-full:2017.1 quay.io/sharelatex/texlive-full:2025.1 quay.io/sharelatex/pandoc:3.9 quay.io/sharelatex/pdftocairo:24.02 TEXLIVE_IMAGE: quay.io/sharelatex/texlive-full:2025.1 TEX_LIVE_IMAGE_NAME_OVERRIDE: us-east1-docker.pkg.dev/overleaf-ops/ol-docker TEXLIVE_IMAGE_USER: tex @@ -61,6 +61,7 @@ services: SANDBOXED_COMPILES_HOST_DIR_COMPILES: $PWD/compiles SANDBOXED_COMPILES_HOST_DIR_OUTPUT: $PWD/output ENABLE_PANDOC_CONVERSIONS: true + ENABLE_PDF_CONVERSIONS: true depends_on: clsi-nginx: condition: service_started diff --git a/services/clsi/package.json b/services/clsi/package.json index d6d5210388..c9abae6c72 100644 --- a/services/clsi/package.json +++ b/services/clsi/package.json @@ -23,6 +23,7 @@ "@overleaf/promise-utils": "workspace:*", "@overleaf/settings": "workspace:*", "@overleaf/stream-utils": "workspace:*", + "@overleaf/validation-tools": "workspace:*", "archiver": "5.3.2", "async": "^3.2.5", "body-parser": "1.20.4", diff --git a/services/clsi/test/acceptance/js/PdfToJpegConversionTests.js b/services/clsi/test/acceptance/js/PdfToJpegConversionTests.js new file mode 100644 index 0000000000..1006982427 --- /dev/null +++ b/services/clsi/test/acceptance/js/PdfToJpegConversionTests.js @@ -0,0 +1,83 @@ +import Client from './helpers/Client.js' +import ClsiApp from './helpers/ClsiApp.js' +import Path from 'node:path' +import fs from 'node:fs/promises' +import { promisify } from 'node:util' +import { execFile as execFileCb } from 'node:child_process' +import { expect } from 'chai' + +const execFile = promisify(execFileCb) + +const FIXTURE_PDF = Path.join(import.meta.dirname, '../fixtures/minimal.pdf') + +const MODE_EXPECTATIONS = { + preview: { width: 794 }, + thumbnail: { width: 190 }, +} + +async function writeResponseToTempfile(response) { + const buffer = Buffer.from(await response.arrayBuffer()) + const tmpPath = `/tmp/clsi-acceptance-pdf-to-jpeg-${crypto.randomUUID()}.jpg` + await fs.writeFile(tmpPath, buffer) + return { tmpPath, buffer } +} + +describe('pdf-to-jpeg conversion', function () { + before(async function () { + await ClsiApp.ensureRunning() + }) + + for (const [mode, { width: expectedWidth }] of Object.entries( + MODE_EXPECTATIONS + )) { + describe(`with mode=${mode}`, function () { + let response + let tmpPath + let buffer + + before(async function () { + response = await Client.convertPdfToJpeg(FIXTURE_PDF, mode) + expect(response.status).to.equal(200) + ;({ tmpPath, buffer } = await writeResponseToTempfile(response)) + }) + + after(async function () { + if (tmpPath) { + await fs.unlink(tmpPath).catch(() => {}) + } + }) + + it('returns a JPEG (per `file`)', async function () { + const { stdout } = await execFile('file', ['--brief', tmpPath]) + expect(stdout).to.match(/JPEG image data/) + }) + + it(`has the expected width of ${expectedWidth}px`, async function () { + const { stdout } = await execFile('identify', [ + '-format', + '%w %h', + tmpPath, + ]) + const [width, height] = stdout.trim().split(' ').map(Number) + expect(width).to.equal(expectedWidth) + // A4 portrait is taller than wide; height must be positive and + // larger than the width (so the aspect ratio was preserved). + expect(height).to.be.greaterThan(width) + }) + + it('returns a non-empty body matching Content-Length', function () { + expect(buffer.length).to.be.greaterThan(0) + expect(buffer.length).to.equal( + Number(response.headers.get('content-length')) + ) + }) + }) + } + + describe('with an unsupported mode', function () { + it('returns 400', async function () { + const response = await Client.convertPdfToJpeg(FIXTURE_PDF, 'not-a-mode') + expect(response.status).to.equal(400) + }) + }) +}) diff --git a/services/clsi/test/acceptance/js/helpers/Client.js b/services/clsi/test/acceptance/js/helpers/Client.js index 67f78161f1..aab13f16f3 100644 --- a/services/clsi/test/acceptance/js/helpers/Client.js +++ b/services/clsi/test/acceptance/js/helpers/Client.js @@ -53,6 +53,16 @@ async function convertDocument(path, type) { } } +async function convertPdfToJpeg(path, mode) { + const formData = new FormData() + formData.append('qqfile', await fsPromises.readFile(path), 'input.pdf') + return await fetch(`${host}/convert/pdf-to-jpeg?mode=${mode}`, { + method: 'POST', + headers: formData.getHeaders(), + body: formData.getBuffer(), + }) +} + async function convertProjectToDocument( projectId, userId, @@ -239,6 +249,7 @@ export default { compile, convertProjectToDocument, convertDocument, + convertPdfToJpeg, stopCompile, clearCache, getOutputFile, diff --git a/services/clsi/test/unit/js/ConversionController.test.js b/services/clsi/test/unit/js/ConversionController.test.js index 7494d0561d..365f07d05a 100644 --- a/services/clsi/test/unit/js/ConversionController.test.js +++ b/services/clsi/test/unit/js/ConversionController.test.js @@ -18,6 +18,7 @@ describe('ConversionController', function () { ctx.documentStat = { size: 5678 } ctx.Settings = { enablePandocConversions: true, + enablePdfConversions: true, path: { compilesDir: '/compiles', outputDir: '/output', diff --git a/yarn.lock b/yarn.lock index 5ba68c1939..cd21422140 100644 --- a/yarn.lock +++ b/yarn.lock @@ -6484,6 +6484,7 @@ __metadata: "@overleaf/promise-utils": "workspace:*" "@overleaf/settings": "workspace:*" "@overleaf/stream-utils": "workspace:*" + "@overleaf/validation-tools": "workspace:*" archiver: "npm:5.3.2" async: "npm:^3.2.5" body-parser: "npm:1.20.4"