From 83b6b323c3f9af071d248e5037e3bbc0198591c6 Mon Sep 17 00:00:00 2001 From: claude Date: Tue, 2 Jun 2026 13:14:47 +0000 Subject: [PATCH] Add cv2/tqdm to base; implement per-project Python venvs (Design B, Phase 1) Base image: add opencv-python-headless (cv2) and tqdm to the bundled scientific stack, and python3-venv (needed to build per-project venvs). Per-project dependencies: a project's requirements.txt is now installed into a venv cached by its sha256 (python3 -m venv --system-site-packages, so the bundled stack stays visible and only extra packages are installed); QuartoRunner points Quarto at it via QUARTO_PYTHON. A per-hash flock serialises concurrent builds; pip output is merged into output.log; on failure the render falls back to the base interpreter. Venvs live under PYTHON_VENVS_DIR (default /var/lib/overleaf/data/python-venvs). Gating: PythonVenvGate.userCanInstallPython restricts installs to the project owner + invited collaborators (ignorePublicAccess excludes anonymous/link users), threaded to CLSI as allowPythonInstall on the editor compile, presentation export, and publish paths. Behind OVERLEAF_ENABLE_PROJECT_PYTHON_VENV (enabled in the deployment). Design doc updated; Phase 2 (egress policy) and Phase 3 (venv eviction) remain. 
Co-Authored-By: Claude Opus 4.8 --- .gitea/workflows/deploy-verso.yml | 6 ++ docs/python-dependencies-design.md | 28 +++++++++- server-ce/Dockerfile-base | 12 ++-- services/clsi/app/js/CompileManager.js | 1 + services/clsi/app/js/QuartoRunner.js | 56 ++++++++++++++++++- services/clsi/app/js/RequestParser.js | 7 +++ .../app/src/Features/Compile/ClsiManager.mjs | 1 + .../Features/Compile/CompileController.mjs | 6 ++ .../Compile/PresentationExportController.mjs | 2 + .../src/Features/Compile/PythonVenvGate.mjs | 33 +++++++++++ .../PublishedPresentationManager.mjs | 2 + services/web/config/settings.defaults.js | 6 ++ 12 files changed, 152 insertions(+), 8 deletions(-) create mode 100644 services/web/app/src/Features/Compile/PythonVenvGate.mjs diff --git a/.gitea/workflows/deploy-verso.yml b/.gitea/workflows/deploy-verso.yml index 9d3787ab9b..4d7e83b5ca 100644 --- a/.gitea/workflows/deploy-verso.yml +++ b/.gitea/workflows/deploy-verso.yml @@ -290,6 +290,12 @@ jobs: # need OVERLEAF_ALLOW_PUBLIC_ACCESS above. - name: OVERLEAF_ALLOW_ANONYMOUS_READ_AND_WRITE_SHARING value: "true" + # Let Quarto Python cells use a project's requirements.txt: + # the compiler installs it into a cached venv. Gated to the + # project owner + invited collaborators (never anonymous / + # link-sharing users). + - name: OVERLEAF_ENABLE_PROJECT_PYTHON_VENV + value: "true" --- apiVersion: v1 kind: Service diff --git a/docs/python-dependencies-design.md b/docs/python-dependencies-design.md index 14ee40e8c1..28695af792 100644 --- a/docs/python-dependencies-design.md +++ b/docs/python-dependencies-design.md @@ -1,7 +1,31 @@ # Design: per-project Python dependencies (cached virtualenv) -Status: **proposal** (not yet implemented). Captures the agreed plan for letting -Quarto `{python}` cells use libraries beyond the curated base set. +Status: **Phase 1 implemented** (gated behind `OVERLEAF_ENABLE_PROJECT_PYTHON_VENV`, +on in the deployment). Network egress policy and venv eviction (Phases 2–3) +remain. 
Captures the plan for letting Quarto `{python}` cells use libraries +beyond the curated base set. + +## What ships in Phase 1 + +- A project root `requirements.txt` is installed into a venv cached by its + sha256, created with `python3 -m venv --system-site-packages`; `QuartoRunner` + points Quarto at it via `QUARTO_PYTHON`. A per-hash `flock` serialises + concurrent builds; pip output is merged into `output.log`; on failure the + render falls back to the base interpreter (and the missing-package message + surfaces). Venvs live under `PYTHON_VENVS_DIR` + (default `/var/lib/overleaf/data/python-venvs`). +- Gated by `userCanInstallPython` (`PythonVenvGate.mjs`) to the project owner + + invited collaborators (any role) — never anonymous / link-sharing users — + threaded to CLSI as `allowPythonInstall` on the editor compile, presentation + export, and publish paths. + +### Known Phase-1 limitations + +- The first build of a heavy `requirements.txt` runs within the compile + timeout; a very large install can be killed and retried next compile (the + venv is only marked complete on success). +- No egress restriction yet (Phase 2) — installs reach PyPI directly. +- No eviction yet (Phase 3) — venvs accumulate under `PYTHON_VENVS_DIR`. ## Background diff --git a/server-ce/Dockerfile-base b/server-ce/Dockerfile-base index c38dd498e2..24bef08a03 100644 --- a/server-ce/Dockerfile-base +++ b/server-ce/Dockerfile-base @@ -81,17 +81,21 @@ RUN mkdir -p /opt/quarto-extensions \ # managed (PEP 668), hence --break-system-packages in this controlled image. # The runtime user (www-data) writes Jupyter's runtime/connection files under # its HOME (/var/www/.local), which is made writable in the Quarto step above. +# python3-venv is needed so a project's requirements.txt can be installed into +# a per-project venv (see QuartoRunner / PythonVenvGate). 
RUN apt-get update \ -&& apt-get install -y python3-pip \ +&& apt-get install -y python3-pip python3-venv \ && pip3 install --no-cache-dir --break-system-packages \ jupyter-core jupyter-client nbclient nbformat ipykernel pyyaml \ && python3 -m ipykernel install --prefix /usr/local --name python3 --display-name "Python 3" \ # Bundle the common scientific-Python stack so most decks "just work" without -# any per-project install. matplotlib renders headless (Agg) automatically. -# To add more later, append to this list (it is the cheapest way to cover a -# library many projects need). +# any per-project install. matplotlib renders headless (Agg) automatically; +# opencv-python-headless is the GUI-less OpenCV build (provides cv2) suited to +# a server. To add more later, append to this list (the cheapest way to cover +# a library many projects need). && pip3 install --no-cache-dir --break-system-packages \ numpy pandas scipy matplotlib seaborn scikit-learn sympy plotly tabulate \ + opencv-python-headless tqdm \ && rm -rf /var/lib/apt/lists/* /root/.cache # Install decktape + headless Chromium (for exporting RevealJS decks to PDF) diff --git a/services/clsi/app/js/CompileManager.js b/services/clsi/app/js/CompileManager.js index fe6472a546..df2d7fffbf 100644 --- a/services/clsi/app/js/CompileManager.js +++ b/services/clsi/app/js/CompileManager.js @@ -248,6 +248,7 @@ async function doCompile(request, stats, timings) { compileGroup: request.compileGroup, stopOnFirstError: request.stopOnFirstError, exportMode: request.exportMode, + allowPythonInstall: request.allowPythonInstall, stats, timings, }) diff --git a/services/clsi/app/js/QuartoRunner.js b/services/clsi/app/js/QuartoRunner.js index 8cc2ffba0d..5a762656f0 100644 --- a/services/clsi/app/js/QuartoRunner.js +++ b/services/clsi/app/js/QuartoRunner.js @@ -26,7 +26,16 @@ function runQuarto(compileName, options, callback) { renderTarget = _writeStandaloneVariant(directory, mainFile) } - const command = 
_buildQuartoCommand(renderTarget, options.exportMode) + // Where cached per-project venvs live (shared across projects, keyed by the + // requirements.txt hash). Must be on a persistent volume in production. + const venvBaseDir = + process.env.PYTHON_VENVS_DIR || '/var/lib/overleaf/data/python-venvs' + const command = _buildQuartoCommand( + renderTarget, + options.exportMode, + Boolean(options.allowPythonInstall), + venvBaseDir + ) ProcessTable[compileName] = CommandRunner.run( compileName, @@ -60,7 +69,12 @@ function runQuarto(compileName, options, callback) { ) } -function _buildQuartoCommand(renderTarget, exportMode) { +function _buildQuartoCommand( + renderTarget, + exportMode, + allowPythonInstall, + venvBaseDir +) { // Run through a POSIX shell so stderr is merged into stdout (2>&1). // LocalCommandRunner replaces $COMPILE_DIR before the shell sees it. // @@ -121,15 +135,53 @@ function _buildQuartoCommand(renderTarget, exportMode) { ? `; rm -rf ${baseName}.qmd ${baseName}_files` : '' + const venvPrep = allowPythonInstall ? _pythonVenvPrep(venvBaseDir) : '' + const cmd = `mkdir -p _extensions && ` + `cp -rn /opt/quarto-extensions/_extensions/. _extensions/ 2>/dev/null; ` + + venvPrep + `quarto render ${inputPath} 2>&1 && ` + tail + cleanup return ['/bin/sh', '-c', cmd] } +// Shell snippet (run before `quarto render`, in the compile dir) that installs +// a project's requirements.txt into a venv cached by the file's sha256 and +// points Quarto at it via QUARTO_PYTHON. Notes: +// - The venv is shared across projects/compiles (keyed by content hash), so +// identical dependency sets are built once. +// - --system-site-packages keeps the bundled scientific stack + ipykernel +// visible, so only the *extra* packages are installed. +// - A per-hash flock serialises concurrent compiles building the same venv. 
+// - Everything is merged to stdout so pip output/errors land in output.log; +// on failure QUARTO_PYTHON is left unset and the render falls back to the +// base interpreter (the missing-package error then surfaces normally). +// - Only $-shell vars / $(...) are used (no ${...}) to avoid clashing with +// JS template interpolation; only ${venvBaseDir} is substituted by JS. +function _pythonVenvPrep(venvBaseDir) { + return ( + `if [ -f requirements.txt ]; then ` + + `VBASE="${venvBaseDir}"; ` + + `RHASH=$(sha256sum requirements.txt 2>/dev/null | cut -d" " -f1); ` + + `if [ -n "$RHASH" ]; then ` + + `VDIR="$VBASE/$RHASH"; mkdir -p "$VBASE" 2>/dev/null; ` + + `( flock 9 || exit 0; ` + + `if [ ! -f "$VDIR/.verso-complete" ]; then ` + + `echo "Installing Python packages from requirements.txt..."; rm -rf "$VDIR"; ` + + `python3 -m venv --system-site-packages "$VDIR" ` + + `&& "$VDIR/bin/pip" install --no-input --disable-pip-version-check -r requirements.txt ` + + `&& touch "$VDIR/.verso-complete" ` + + `|| echo "ERROR: Failed to install Python packages from requirements.txt"; ` + + `fi ` + + `) 9>"$VBASE/.$RHASH.lock" 2>&1; ` + + `if [ -f "$VDIR/.verso-complete" ]; then export QUARTO_PYTHON="$VDIR/bin/python3"; fi; ` + + `fi; ` + + `fi; ` + ) +} + // Write a temporary copy of the root .qmd with embed-resources enabled in its // frontmatter, returning the temp filename to render. On any problem (no // frontmatter, not a nested revealjs deck, read/write error) it falls back to diff --git a/services/clsi/app/js/RequestParser.js b/services/clsi/app/js/RequestParser.js index 5e77715da6..ad1db87fa6 100644 --- a/services/clsi/app/js/RequestParser.js +++ b/services/clsi/app/js/RequestParser.js @@ -109,6 +109,13 @@ function parse(body, callback) { compile.options.exportMode, { default: '', type: 'string' } ) + // Verso: whether QuartoRunner may install the project's requirements.txt + // into a cached venv (gated by privilege on the web side). 
+ response.allowPythonInstall = _parseAttribute( + 'allowPythonInstall', + compile.options.allowPythonInstall, + { default: false, type: 'boolean' } + ) response.flags = _parseAttribute('flags', compile.options.flags, { default: [], type: 'object', diff --git a/services/web/app/src/Features/Compile/ClsiManager.mjs b/services/web/app/src/Features/Compile/ClsiManager.mjs index 8c950cc34e..9493664cb6 100644 --- a/services/web/app/src/Features/Compile/ClsiManager.mjs +++ b/services/web/app/src/Features/Compile/ClsiManager.mjs @@ -1149,6 +1149,7 @@ function _finaliseRequest(projectId, options, project, docs, files) { draft: Boolean(options.draft), stopOnFirstError: Boolean(options.stopOnFirstError), exportMode: options.exportMode, + allowPythonInstall: Boolean(options.allowPythonInstall), check: options.check, syncType: options.syncType, syncState: options.syncState, diff --git a/services/web/app/src/Features/Compile/CompileController.mjs b/services/web/app/src/Features/Compile/CompileController.mjs index ef176598cf..41c2cf42ae 100644 --- a/services/web/app/src/Features/Compile/CompileController.mjs +++ b/services/web/app/src/Features/Compile/CompileController.mjs @@ -7,6 +7,7 @@ import logger from '@overleaf/logger' import Settings from '@overleaf/settings' import Errors from '../Errors/Errors.js' import SessionManager from '../Authentication/SessionManager.mjs' +import { userCanInstallPython } from './PythonVenvGate.mjs' import { RateLimiter } from '../../infrastructure/RateLimiter.mjs' import Validation from '../../infrastructure/Validation.mjs' import Path from 'node:path' @@ -201,6 +202,11 @@ const _CompileController = { options.incrementalCompilesEnabled = true } + // Allow building a per-project Python venv from requirements.txt only for + // the project owner and invited collaborators — never anonymous or + // link-sharing users. 
+ options.allowPythonInstall = await userCanInstallPython(userId, projectId) + let { enablePdfCaching, pdfCachingMinChunkSize, diff --git a/services/web/app/src/Features/Compile/PresentationExportController.mjs b/services/web/app/src/Features/Compile/PresentationExportController.mjs index 844ed1cba1..b811eede8a 100644 --- a/services/web/app/src/Features/Compile/PresentationExportController.mjs +++ b/services/web/app/src/Features/Compile/PresentationExportController.mjs @@ -6,6 +6,7 @@ import SessionManager from '../Authentication/SessionManager.mjs' import CompileManager from './CompileManager.mjs' import ClsiManager from './ClsiManager.mjs' import ProjectGetter from '../Project/ProjectGetter.mjs' +import { userCanInstallPython } from './PythonVenvGate.mjs' // On-demand export of a RevealJS deck from the editor's download menu. // - html → a single self-contained .html (embed-resources) @@ -61,6 +62,7 @@ async function exportPresentation(req, res) { await CompileManager.promises.compile(projectId, userId, { exportMode: format.exportMode, bypassRecentCompileCheck: true, + allowPythonInstall: await userCanInstallPython(userId, projectId), }) if (!buildId || !outputFiles?.some(f => f.path === format.file)) { diff --git a/services/web/app/src/Features/Compile/PythonVenvGate.mjs b/services/web/app/src/Features/Compile/PythonVenvGate.mjs new file mode 100644 index 0000000000..fb890ef28b --- /dev/null +++ b/services/web/app/src/Features/Compile/PythonVenvGate.mjs @@ -0,0 +1,33 @@ +import Settings from '@overleaf/settings' +import logger from '@overleaf/logger' +import AuthorizationManager from '../Authorization/AuthorizationManager.mjs' + +// Whether this user may have the compiler install a project's requirements.txt +// into a cached venv (so Quarto's Python cells can use libraries beyond the +// bundled base set). 
Gated to the project owner + invited collaborators (any +// role): ignorePublicAccess excludes link-sharing/public and anonymous users, +// who fall back to the base Python interpreter. Returns false when the feature +// is disabled or the privilege check fails. +export async function userCanInstallPython(userId, projectId) { + if (!Settings.enableProjectPythonVenv) { + return false + } + try { + const privilegeLevel = + await AuthorizationManager.promises.getPrivilegeLevelForProject( + userId, + projectId, + null, + { ignorePublicAccess: true } + ) + return Boolean(privilegeLevel) + } catch (err) { + logger.warn( + { err, projectId, userId }, + 'could not determine python install privilege; defaulting to false' + ) + return false + } +} + +export default { userCanInstallPython } diff --git a/services/web/app/src/Features/PublishedPresentation/PublishedPresentationManager.mjs b/services/web/app/src/Features/PublishedPresentation/PublishedPresentationManager.mjs index 6541e6093a..d854af7a9f 100644 --- a/services/web/app/src/Features/PublishedPresentation/PublishedPresentationManager.mjs +++ b/services/web/app/src/Features/PublishedPresentation/PublishedPresentationManager.mjs @@ -8,6 +8,7 @@ import { fetchStream } from '@overleaf/fetch-utils' import { callbackify } from 'node:util' import CompileManager from '../Compile/CompileManager.mjs' import { getOutputFileURL } from '../Compile/ClsiURLHelpers.mjs' +import { userCanInstallPython } from '../Compile/PythonVenvGate.mjs' import { PublishedPresentation } from '../../models/PublishedPresentation.mjs' import Errors from '../Errors/Errors.js' @@ -56,6 +57,7 @@ async function publish(projectId, userId) { const { status, outputFiles, clsiServerId, buildId } = await CompileManager.promises.compile(projectId, userId, { bypassRecentCompileCheck: true, + allowPythonInstall: await userCanInstallPython(userId, projectId), }) if (!outputFiles?.some(f => f.path === 'output.html')) { diff --git 
a/services/web/config/settings.defaults.js b/services/web/config/settings.defaults.js index 341e49c4c0..317eaca13f 100644 --- a/services/web/config/settings.defaults.js +++ b/services/web/config/settings.defaults.js @@ -478,6 +478,12 @@ module.exports = { : 'quarto', enableSubscriptions: false, restrictedCountries: [], + + // When true, a project's requirements.txt is installed into a cached venv so + // Quarto's Python cells can use libraries beyond the bundled base set. Gated + // via PythonVenvGate to the project owner + invited collaborators only. + enableProjectPythonVenv: + process.env.OVERLEAF_ENABLE_PROJECT_PYTHON_VENV === 'true', enableOnboardingEmails: process.env.ENABLE_ONBOARDING_EMAILS === 'true', enabledLinkedFileTypes: (process.env.ENABLED_LINKED_FILE_TYPES || '').split(