[clsi] initial version of /convert/pdf-to-jpeg (#33752)
* [monorepo] consolidate clsi-lb host/ip env-vars

  Target env-var is CLSI_LB_HOST. Keep CLSI_LB_IP populated for a week.

* [clsi] initial version of /convert/pdf-to-jpeg
* [rails] use fake-secrets in CI and Codespaces
* [rails] adapt tests for using clsi to convert PDFs to image
* [rails] add rake task for comparing clsi conversion with transloadit
* [clsi] double check that output.jpg is a regular file

  Co-authored-by: Brian Gough <brian.gough@overleaf.com>

* [clsi] fix composing basename
* [monorepo] fix clsi-lb host env-var post merge
* [monorepo] sort dev-environment.env hosts
* [rails] use local pdf file rather than downloading it again

  Download from the old renderer code path still. It's dead code.

* [terraform] clsi: enable pdf to jpg conversion

---------

Co-authored-by: Brian Gough <brian.gough@overleaf.com>
GitOrigin-RevId: 5ecaa8559d299486340bb3961f06b29f7c4dfcca
This commit is contained in:
@@ -32,6 +32,7 @@ COPY libraries/overleaf-editor-core/package.json /overleaf/libraries/overleaf-ed
|
||||
COPY libraries/promise-utils/package.json /overleaf/libraries/promise-utils/package.json
|
||||
COPY libraries/settings/package.json /overleaf/libraries/settings/package.json
|
||||
COPY libraries/stream-utils/package.json /overleaf/libraries/stream-utils/package.json
|
||||
COPY libraries/validation-tools/package.json /overleaf/libraries/validation-tools/package.json
|
||||
COPY services/clsi/package.json /overleaf/services/clsi/package.json
|
||||
COPY .yarn/patches/ /overleaf/.yarn/patches/
|
||||
|
||||
@@ -45,6 +46,7 @@ COPY libraries/overleaf-editor-core/ /overleaf/libraries/overleaf-editor-core/
|
||||
COPY libraries/promise-utils/ /overleaf/libraries/promise-utils/
|
||||
COPY libraries/settings/ /overleaf/libraries/settings/
|
||||
COPY libraries/stream-utils/ /overleaf/libraries/stream-utils/
|
||||
COPY libraries/validation-tools/ /overleaf/libraries/validation-tools/
|
||||
COPY services/clsi/ /overleaf/services/clsi/
|
||||
|
||||
FROM app AS with-texlive
|
||||
|
||||
@@ -25,6 +25,7 @@ IMAGE_CACHE ?= $(IMAGE_REPO):cache-$(shell cat \
|
||||
$(MONOREPO)/libraries/promise-utils/package.json \
|
||||
$(MONOREPO)/libraries/settings/package.json \
|
||||
$(MONOREPO)/libraries/stream-utils/package.json \
|
||||
$(MONOREPO)/libraries/validation-tools/package.json \
|
||||
$(MONOREPO)/services/clsi/package.json \
|
||||
$(MONOREPO)/.yarn/patches/* \
|
||||
| sha256sum | cut -d '-' -f1)
|
||||
@@ -169,8 +170,9 @@ test_acceptance_clean:
|
||||
$(DOCKER_COMPOSE_TEST_ACCEPTANCE) down -v -t 0
|
||||
|
||||
test_acceptance_pre_run:
|
||||
docker pull us-east1-docker.pkg.dev/overleaf-ops/ol-docker/pandoc:3.9
|
||||
docker pull us-east1-docker.pkg.dev/overleaf-ops/ol-docker/pandoc-staging:3.9
|
||||
-docker pull us-east1-docker.pkg.dev/overleaf-ops/ol-docker/pandoc:3.9
|
||||
-docker pull us-east1-docker.pkg.dev/overleaf-ops/ol-docker/pandoc-staging:3.9
|
||||
-cd ../../ && docker build -t us-east1-docker.pkg.dev/overleaf-ops/ol-docker/pdftocairo:24.02 dockerfiles/pdftocairo
|
||||
ifneq (,$(wildcard test/acceptance/js/scripts/pre-run))
|
||||
$(DOCKER_COMPOSE_TEST_ACCEPTANCE) run $(DC_RUN_FLAGS) test_acceptance test/acceptance/js/scripts/pre-run
|
||||
endif
|
||||
|
||||
@@ -145,6 +145,11 @@ app.post(
|
||||
bodyParser.json({ limit: Settings.compileSizeLimit }),
|
||||
ConversionController.convertProjectToDocument
|
||||
)
|
||||
app.post(
|
||||
'/convert/pdf-to-jpeg',
|
||||
FileUploadMiddleware.multerMiddleware,
|
||||
ConversionController.convertPDFToJPEG
|
||||
)
|
||||
|
||||
if (process.env.NODE_ENV === 'development' && global.__coverage__) {
|
||||
app.get('/coverage', (req, res) => {
|
||||
|
||||
@@ -14,6 +14,7 @@ import RequestParser from './RequestParser.js'
|
||||
import { pipeline } from 'node:stream/promises'
|
||||
import Settings from '@overleaf/settings'
|
||||
import Path from 'node:path'
|
||||
import { z } from '@overleaf/validation-tools'
|
||||
|
||||
const CONVERSION_CONFIGS = {
|
||||
docx: { extension: 'docx' },
|
||||
@@ -77,6 +78,51 @@ async function convertDocumentToLaTeX(req, res) {
|
||||
}
|
||||
}
|
||||
|
||||
const PDFToJPEGQuerySchema = z.object({
|
||||
mode: z.enum(['preview', 'thumbnail']),
|
||||
})
|
||||
|
||||
/**
 * Express handler for POST /convert/pdf-to-jpeg.
 *
 * Converts the uploaded PDF (`req.file.path`) to a JPEG using the preset
 * named by the `mode` query parameter and streams the result back as an
 * `output.jpg` attachment.
 *
 * Responses:
 *  - 404 when `Settings.enablePdfConversions` is not set
 *  - 400 when no uploaded file is attached to the request, or when the query
 *    string does not satisfy `PDFToJPEGQuerySchema`
 *  - 200 with the JPEG body otherwise
 *
 * Errors from the conversion or from streaming are not caught here; they
 * reject the returned promise.
 *
 * @param {import('express').Request} req
 * @param {import('express').Response} res
 */
async function convertPDFToJPEG(req, res) {
  // `req.file` may be missing (e.g. the request carried no file part), so read
  // it defensively: that case should be a 400, not a TypeError.
  const path = req.file?.path

  let jpegPath
  try {
    if (!Settings.enablePdfConversions) {
      return res.sendStatus(404)
    }
    if (!path) {
      return res.sendStatus(400)
    }

    const parsed = PDFToJPEGQuerySchema.safeParse(req.query)
    if (!parsed.success) {
      return res.sendStatus(400)
    }
    const { mode } = parsed.data
    logger.debug({ path, mode }, 'received pdf for conversion to jpeg')

    const conversionId = crypto.randomUUID()
    jpegPath = await ConversionManager.promises.convertPDFToJPEGWithLock(
      conversionId,
      path,
      mode
    )
  } finally {
    // The upload is no longer needed on any exit path (rejected request,
    // failed conversion, or success). Removal is deliberately best-effort.
    if (path) {
      await fs.unlink(path).catch(() => {})
    }
  }

  try {
    const { size } = await fs.stat(jpegPath)

    res.setHeader('Content-Length', size)
    res.attachment('output.jpg')
    res.setHeader('X-Content-Type-Options', 'nosniff')

    await pipeline(fsSync.createReadStream(jpegPath), res)
  } finally {
    // Remove the whole directory holding the JPEG, not just the file itself.
    // Best-effort: a failed cleanup must not mask the streaming outcome.
    await fs
      .rm(Path.dirname(jpegPath), { recursive: true, force: true })
      .catch(() => {})
  }
}
|
||||
|
||||
async function convertProjectToDocument(req, res) {
|
||||
if (!Settings.enablePandocConversions) {
|
||||
return res.sendStatus(404)
|
||||
@@ -207,4 +253,5 @@ async function convertProjectToDocument(req, res) {
|
||||
export default {
|
||||
convertDocumentToLaTeX: expressify(convertDocumentToLaTeX),
|
||||
convertProjectToDocument: expressify(convertProjectToDocument),
|
||||
convertPDFToJPEG: expressify(convertPDFToJPEG),
|
||||
}
|
||||
|
||||
@@ -18,6 +18,18 @@ const CONVERSION_CONFIGS = {
|
||||
},
|
||||
}
|
||||
|
||||
// Rendering presets keyed by conversion mode: target width in pixels and JPEG
// quality. Frozen so that the shared table cannot be mutated by accident.
const PDF_TO_JPEG_CONFIGS = Object.freeze({
  preview: Object.freeze({ width: 794, quality: 90 }),
  thumbnail: Object.freeze({ width: 190, quality: 50 }),
})

// File names used inside the per-conversion directory.
const PDF_TO_JPEG_INPUT_FILENAME = 'input.pdf'
const PDF_TO_JPEG_OUTPUT_FILENAME = 'output.jpg'
// Output name without its extension, derived from the full file name so the
// two cannot drift apart.
const PDF_TO_JPEG_OUTPUT_BASENAME = Path.basename(
  PDF_TO_JPEG_OUTPUT_FILENAME,
  Path.extname(PDF_TO_JPEG_OUTPUT_FILENAME)
)
|
||||
|
||||
async function convertToLaTeXWithLock(conversionId, inputPath, conversionType) {
|
||||
const conversionDir = Path.join(Settings.path.compilesDir, conversionId)
|
||||
const lock = LockManager.acquire(conversionDir)
|
||||
@@ -298,9 +310,76 @@ async function convertLaTeXToDocumentInDir(
|
||||
return Path.join(compileDir, finalOutputName)
|
||||
}
|
||||
|
||||
/**
 * Run a PDF-to-JPEG conversion while holding the lock for its working
 * directory (`<compilesDir>/<conversionId>`).
 *
 * @param {string} conversionId - names the working directory
 * @param {string} inputPath - path of the PDF to convert
 * @param {string} mode - key into PDF_TO_JPEG_CONFIGS
 * @returns {Promise<string>} path of the generated JPEG
 */
async function convertPDFToJPEGWithLock(conversionId, inputPath, mode) {
  const workDir = Path.join(Settings.path.compilesDir, conversionId)
  const dirLock = LockManager.acquire(workDir)
  try {
    const jpegPath = await convertPDFToJPEG(
      conversionId,
      workDir,
      inputPath,
      mode
    )
    return jpegPath
  } finally {
    // Release on success and on failure alike.
    dirLock.release()
  }
}
|
||||
|
||||
/**
 * Convert a PDF to a single JPEG inside `conversionDir` by running
 * `pdftocairo` through CommandRunner.
 *
 * The input is copied into `conversionDir` as `input.pdf`; on success only the
 * generated `output.jpg` is left there and its path is returned. On any
 * failure (unsupported mode, directory setup, copy, non-zero exit code, or
 * output that is not a regular file) `conversionDir` is removed and an OError
 * 'pdf-to-jpeg conversion failed' wrapping the original error is thrown.
 *
 * @param {string} conversionId - passed to CommandRunner as the run id
 * @param {string} conversionDir - working directory; created if missing
 * @param {string} inputPath - path of the PDF to convert
 * @param {string} mode - key into PDF_TO_JPEG_CONFIGS
 * @returns {Promise<string>} path of the generated JPEG
 */
async function convertPDFToJPEG(conversionId, conversionDir, inputPath, mode) {
  const newSourcePath = Path.join(conversionDir, PDF_TO_JPEG_INPUT_FILENAME)
  const dstPath = Path.join(conversionDir, PDF_TO_JPEG_OUTPUT_FILENAME)

  try {
    // Own-property check so that inherited keys such as "toString" are not
    // mistaken for a preset; fail before touching the file system.
    if (!Object.hasOwn(PDF_TO_JPEG_CONFIGS, mode)) {
      throw new OError('unsupported pdf-to-jpeg mode', { mode })
    }
    const config = PDF_TO_JPEG_CONFIGS[mode]

    // Directory setup lives inside the try block so that a failed copy does
    // not leave a half-populated conversion directory behind.
    await fs.mkdir(conversionDir, { recursive: true })
    await fs.copyFile(inputPath, newSourcePath)

    const { stdout, stderr, exitCode } = await CommandRunner.promises.run(
      conversionId,
      [
        'pdftocairo',
        '-jpeg',
        '-jpegopt',
        `quality=${config.quality}`,
        '-singlefile',
        '-scale-to-x',
        config.width.toString(),
        '-scale-to-y',
        '-1', // maintain aspect ratio
        PDF_TO_JPEG_INPUT_FILENAME,
        PDF_TO_JPEG_OUTPUT_BASENAME,
      ],
      conversionDir,
      Settings.pdftocairoImage,
      Settings.conversionTimeoutSeconds * 1000,
      {},
      'conversions',
      null
    )
    if (exitCode !== 0) {
      throw new OError('Non-zero exit code from pdftocairo', {
        exitCode,
        stderr,
      })
    }
    logger.debug(
      { stdout, stderr, exitCode },
      'pdf-to-jpeg conversion completed'
    )

    // lstat (not stat): a symlink written in place of the output must not be
    // accepted as the result.
    const stat = await fs.lstat(dstPath)
    if (!stat.isFile()) {
      throw new OError('output.jpg is not a regular file', { stat })
    }

    // Clean up the source PDF to leave only the conversion result
    await fs.unlink(newSourcePath).catch(() => {})
  } catch (error) {
    // Best-effort removal; `force` makes this a no-op if the directory was
    // never created.
    await fs.rm(conversionDir, { force: true, recursive: true }).catch(() => {})
    throw new OError('pdf-to-jpeg conversion failed').withCause(error)
  }

  return dstPath
}
|
||||
|
||||
export default {
|
||||
promises: {
|
||||
convertToLaTeXWithLock,
|
||||
convertLaTeXToDocumentInDirWithLock,
|
||||
convertPDFToJPEGWithLock,
|
||||
},
|
||||
}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
clsi
|
||||
--data-dirs=cache,compiles,output
|
||||
--dependencies=
|
||||
--env-add=DOWNLOAD_HOST=http://clsi-nginx:8080,ALLOWED_COMPILE_GROUPS=clsi-perf simple-latex-file,ENABLE_PDF_CACHING=true,PDF_CACHING_ENABLE_WORKER_POOL=true,ALLOWED_IMAGES=quay.io/sharelatex/texlive-full:2017.1 quay.io/sharelatex/texlive-full:2025.1 quay.io/sharelatex/pandoc:3.9,TEXLIVE_IMAGE=quay.io/sharelatex/texlive-full:2025.1,TEX_LIVE_IMAGE_NAME_OVERRIDE=us-east1-docker.pkg.dev/overleaf-ops/ol-docker,TEXLIVE_IMAGE_USER=tex,SANDBOXED_COMPILES=true,SANDBOXED_COMPILES_HOST_DIR_COMPILES=$PWD/compiles,SANDBOXED_COMPILES_HOST_DIR_OUTPUT=$PWD/output,ENABLE_PANDOC_CONVERSIONS=true
|
||||
--env-add=DOWNLOAD_HOST=http://clsi-nginx:8080,ALLOWED_COMPILE_GROUPS=clsi-perf simple-latex-file,ENABLE_PDF_CACHING=true,PDF_CACHING_ENABLE_WORKER_POOL=true,ALLOWED_IMAGES=quay.io/sharelatex/texlive-full:2017.1 quay.io/sharelatex/texlive-full:2025.1 quay.io/sharelatex/pandoc:3.9 quay.io/sharelatex/pdftocairo:24.02,TEXLIVE_IMAGE=quay.io/sharelatex/texlive-full:2025.1,TEX_LIVE_IMAGE_NAME_OVERRIDE=us-east1-docker.pkg.dev/overleaf-ops/ol-docker,TEXLIVE_IMAGE_USER=tex,SANDBOXED_COMPILES=true,SANDBOXED_COMPILES_HOST_DIR_COMPILES=$PWD/compiles,SANDBOXED_COMPILES_HOST_DIR_OUTPUT=$PWD/output,ENABLE_PANDOC_CONVERSIONS=true,ENABLE_PDF_CONVERSIONS=true
|
||||
--env-pass-through=
|
||||
--esmock-loader=False
|
||||
--node-version=24.14.1
|
||||
|
||||
@@ -31,6 +31,9 @@ module.exports = {
|
||||
parseInt(process.env.CLSI_CONVERSION_TIMEOUT_SECONDS, 10) || 60,
|
||||
pandocImage: process.env.PANDOC_IMAGE || 'quay.io/sharelatex/pandoc:3.9',
|
||||
enablePandocConversions: process.env.ENABLE_PANDOC_CONVERSIONS === 'true',
|
||||
pdftocairoImage:
|
||||
process.env.PDFTOCAIRO_IMAGE || 'quay.io/sharelatex/pdftocairo:24.02',
|
||||
enablePdfConversions: process.env.ENABLE_PDF_CONVERSIONS === 'true',
|
||||
maxUploadSize: 50 * 1024 * 1024,
|
||||
|
||||
internal: {
|
||||
|
||||
@@ -30,7 +30,7 @@ services:
|
||||
ALLOWED_COMPILE_GROUPS: clsi-perf simple-latex-file
|
||||
ENABLE_PDF_CACHING: true
|
||||
PDF_CACHING_ENABLE_WORKER_POOL: true
|
||||
ALLOWED_IMAGES: quay.io/sharelatex/texlive-full:2017.1 quay.io/sharelatex/texlive-full:2025.1 quay.io/sharelatex/pandoc:3.9
|
||||
ALLOWED_IMAGES: quay.io/sharelatex/texlive-full:2017.1 quay.io/sharelatex/texlive-full:2025.1 quay.io/sharelatex/pandoc:3.9 quay.io/sharelatex/pdftocairo:24.02
|
||||
TEXLIVE_IMAGE: quay.io/sharelatex/texlive-full:2025.1
|
||||
TEX_LIVE_IMAGE_NAME_OVERRIDE: us-east1-docker.pkg.dev/overleaf-ops/ol-docker
|
||||
TEXLIVE_IMAGE_USER: tex
|
||||
@@ -38,6 +38,7 @@ services:
|
||||
SANDBOXED_COMPILES_HOST_DIR_COMPILES: $PWD/compiles
|
||||
SANDBOXED_COMPILES_HOST_DIR_OUTPUT: $PWD/output
|
||||
ENABLE_PANDOC_CONVERSIONS: true
|
||||
ENABLE_PDF_CONVERSIONS: true
|
||||
volumes:
|
||||
- ./reports:/overleaf/services/clsi/reports
|
||||
- ./compiles:/overleaf/services/clsi/compiles
|
||||
|
||||
@@ -53,7 +53,7 @@ services:
|
||||
ALLOWED_COMPILE_GROUPS: clsi-perf simple-latex-file
|
||||
ENABLE_PDF_CACHING: true
|
||||
PDF_CACHING_ENABLE_WORKER_POOL: true
|
||||
ALLOWED_IMAGES: quay.io/sharelatex/texlive-full:2017.1 quay.io/sharelatex/texlive-full:2025.1 quay.io/sharelatex/pandoc:3.9
|
||||
ALLOWED_IMAGES: quay.io/sharelatex/texlive-full:2017.1 quay.io/sharelatex/texlive-full:2025.1 quay.io/sharelatex/pandoc:3.9 quay.io/sharelatex/pdftocairo:24.02
|
||||
TEXLIVE_IMAGE: quay.io/sharelatex/texlive-full:2025.1
|
||||
TEX_LIVE_IMAGE_NAME_OVERRIDE: us-east1-docker.pkg.dev/overleaf-ops/ol-docker
|
||||
TEXLIVE_IMAGE_USER: tex
|
||||
@@ -61,6 +61,7 @@ services:
|
||||
SANDBOXED_COMPILES_HOST_DIR_COMPILES: $PWD/compiles
|
||||
SANDBOXED_COMPILES_HOST_DIR_OUTPUT: $PWD/output
|
||||
ENABLE_PANDOC_CONVERSIONS: true
|
||||
ENABLE_PDF_CONVERSIONS: true
|
||||
depends_on:
|
||||
clsi-nginx:
|
||||
condition: service_started
|
||||
|
||||
@@ -23,6 +23,7 @@
|
||||
"@overleaf/promise-utils": "workspace:*",
|
||||
"@overleaf/settings": "workspace:*",
|
||||
"@overleaf/stream-utils": "workspace:*",
|
||||
"@overleaf/validation-tools": "workspace:*",
|
||||
"archiver": "5.3.2",
|
||||
"async": "^3.2.5",
|
||||
"body-parser": "1.20.4",
|
||||
|
||||
@@ -0,0 +1,83 @@
|
||||
import Client from './helpers/Client.js'
|
||||
import ClsiApp from './helpers/ClsiApp.js'
|
||||
import Path from 'node:path'
|
||||
import fs from 'node:fs/promises'
|
||||
import { promisify } from 'node:util'
|
||||
import { execFile as execFileCb } from 'node:child_process'
|
||||
import { expect } from 'chai'
|
||||
|
||||
const execFile = promisify(execFileCb)
|
||||
|
||||
const FIXTURE_PDF = Path.join(import.meta.dirname, '../fixtures/minimal.pdf')
|
||||
|
||||
const MODE_EXPECTATIONS = {
|
||||
preview: { width: 794 },
|
||||
thumbnail: { width: 190 },
|
||||
}
|
||||
|
||||
/**
 * Buffer a fetch response body and write it to a uniquely named `.jpg` file.
 *
 * @param {Response} response - response whose body is read in full
 * @param {string} [tmpDir='/tmp'] - directory to write into; must exist
 * @returns {Promise<{tmpPath: string, buffer: Buffer}>} the file written and
 *   the bytes it contains
 */
async function writeResponseToTempfile(response, tmpDir = '/tmp') {
  const buffer = Buffer.from(await response.arrayBuffer())
  // A random UUID in the name keeps concurrent or repeated runs from colliding.
  const tmpPath = Path.join(
    tmpDir,
    `clsi-acceptance-pdf-to-jpeg-${crypto.randomUUID()}.jpg`
  )
  await fs.writeFile(tmpPath, buffer)
  return { tmpPath, buffer }
}
|
||||
|
||||
// Acceptance suite for the pdf-to-jpeg endpoint: every supported mode must
// yield a JPEG of the configured width, and an unknown mode must be rejected.
describe('pdf-to-jpeg conversion', function () {
  before(async function () {
    await ClsiApp.ensureRunning()
  })

  Object.keys(MODE_EXPECTATIONS).forEach(mode => {
    const expectedWidth = MODE_EXPECTATIONS[mode].width

    describe(`with mode=${mode}`, function () {
      // Filled in once by the `before` hook and shared by the assertions below.
      let response
      let tmpPath
      let buffer

      before(async function () {
        response = await Client.convertPdfToJpeg(FIXTURE_PDF, mode)
        expect(response.status).to.equal(200)
        const written = await writeResponseToTempfile(response)
        tmpPath = written.tmpPath
        buffer = written.buffer
      })

      after(async function () {
        // Nothing to remove if the `before` hook failed before writing.
        if (!tmpPath) {
          return
        }
        await fs.unlink(tmpPath).catch(() => {})
      })

      it('returns a JPEG (per `file`)', async function () {
        const result = await execFile('file', ['--brief', tmpPath])
        expect(result.stdout).to.match(/JPEG image data/)
      })

      it(`has the expected width of ${expectedWidth}px`, async function () {
        const identifyArgs = ['-format', '%w %h', tmpPath]
        const { stdout } = await execFile('identify', identifyArgs)
        const dimensions = stdout.trim().split(' ').map(Number)
        const width = dimensions[0]
        const height = dimensions[1]
        expect(width).to.equal(expectedWidth)
        // A4 portrait is taller than wide; height must be positive and
        // larger than the width (so the aspect ratio was preserved).
        expect(height).to.be.greaterThan(width)
      })

      it('returns a non-empty body matching Content-Length', function () {
        const declaredLength = Number(response.headers.get('content-length'))
        expect(buffer.length).to.be.greaterThan(0)
        expect(buffer.length).to.equal(declaredLength)
      })
    })
  })

  describe('with an unsupported mode', function () {
    it('returns 400', async function () {
      const rejected = await Client.convertPdfToJpeg(FIXTURE_PDF, 'not-a-mode')
      expect(rejected.status).to.equal(400)
    })
  })
})
|
||||
@@ -53,6 +53,16 @@ async function convertDocument(path, type) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * POST a PDF from disk to the /convert/pdf-to-jpeg endpoint as a multipart
 * upload (field `qqfile`, file name `input.pdf`).
 *
 * @param {string} path - path of the PDF file to upload
 * @param {string} mode - value of the `mode` query parameter
 * @returns {Promise<Response>} the raw fetch response; the status is not checked
 */
async function convertPdfToJpeg(path, mode) {
  const formData = new FormData()
  formData.append('qqfile', await fsPromises.readFile(path), 'input.pdf')

  // Build the query string with URLSearchParams so that `mode` is escaped
  // instead of being spliced into the URL verbatim.
  const query = new URLSearchParams({ mode })

  return await fetch(`${host}/convert/pdf-to-jpeg?${query}`, {
    method: 'POST',
    headers: formData.getHeaders(),
    body: formData.getBuffer(),
  })
}
|
||||
|
||||
async function convertProjectToDocument(
|
||||
projectId,
|
||||
userId,
|
||||
@@ -239,6 +249,7 @@ export default {
|
||||
compile,
|
||||
convertProjectToDocument,
|
||||
convertDocument,
|
||||
convertPdfToJpeg,
|
||||
stopCompile,
|
||||
clearCache,
|
||||
getOutputFile,
|
||||
|
||||
@@ -18,6 +18,7 @@ describe('ConversionController', function () {
|
||||
ctx.documentStat = { size: 5678 }
|
||||
ctx.Settings = {
|
||||
enablePandocConversions: true,
|
||||
enablePdfConversions: true,
|
||||
path: {
|
||||
compilesDir: '/compiles',
|
||||
outputDir: '/output',
|
||||
|
||||
Reference in New Issue
Block a user