a9a9f6ee6b
* migrate recover_zip_from_backup from archiver to zip-stream Replace the `archiver` package with `zip-stream` (the lower-level library that `archiver` wraps) in the `recover_zip_from_backup.mjs` script and `backupArchiver.mjs` library. The `archiver` package has known issues with hanging when creating large zip files and is no longer actively maintained. Changes: - Add `zip-stream@^7.0.2` as a direct dependency - Update `backupArchiver.mjs` to use promisified `ZipStream.entry()` instead of `Archiver.append()` - Rewrite `recover_zip_from_backup.mjs` to use `ZipStream` with `stream/promises.pipeline` for cleaner async flow - Keep `archiver` dependency for `project_archive.js` (separate code path) Agent-Logs-Url: https://github.com/overleaf/internal/sessions/0df27a8b-97f1-43cc-ac26-f5247a84313f Co-authored-by: briangough <7457354+briangough@users.noreply.github.com> * extract finalize timeout to named constant Agent-Logs-Url: https://github.com/overleaf/internal/sessions/0df27a8b-97f1-43cc-ac26-f5247a84313f Co-authored-by: briangough <7457354+briangough@users.noreply.github.com> * convert recover_zip.js to zip-stream, remove finalize timeout, add verbose logging Agent-Logs-Url: https://github.com/overleaf/internal/sessions/9380d08a-d813-4e9f-a2ac-4891122c163b Co-authored-by: briangough <7457354+briangough@users.noreply.github.com> * add acceptance tests for recover_zip_from_backup in raw and latest modes Agent-Logs-Url: https://github.com/overleaf/internal/sessions/9380d08a-d813-4e9f-a2ac-4891122c163b Co-authored-by: briangough <7457354+briangough@users.noreply.github.com> * fix comment formatting in recover_zip_from_backup.mjs Agent-Logs-Url: https://github.com/overleaf/internal/sessions/9380d08a-d813-4e9f-a2ac-4891122c163b Co-authored-by: briangough <7457354+briangough@users.noreply.github.com> * restore EventEmitter.defaultMaxListeners in recover_zip.js, add acceptance test Agent-Logs-Url: https://github.com/overleaf/internal/sessions/e7443126-22d5-4d0e-a176-a7a5dba49ffd Co-authored-by: briangough <7457354+briangough@users.noreply.github.com> * fix formatting * refactor: simplify stream handling by using named imports for pipeline * fix blob hash verification in backup acceptance tests * fix recover_zip script and tests * fix: exit with non-zero status on error in recover_zip.js Agent-Logs-Url: https://github.com/overleaf/internal/sessions/ef3f109b-488f-47c9-84a5-b5269387166a Co-authored-by: briangough <7457354+briangough@users.noreply.github.com> * migrate from npm to yarn --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: briangough <7457354+briangough@users.noreply.github.com> Co-authored-by: Brian Gough <briangough@users.noreply.github.com> GitOrigin-RevId: 6255f9610f3c846790e2ed8b1979ac08b7effece
329 lines
9.2 KiB
JavaScript
329 lines
9.2 KiB
JavaScript
/**
|
|
* Try to recover a zip of the latest version of a project using only data in
|
|
* GCS, where this data may have been (recently) hard deleted (i.e. may exist
|
|
* wholely or in part as non-current versions). This should be able to
|
|
* retrieve the latest content of a project up to 180 days after it was
|
|
* deleted.
|
|
*
|
|
* Usage:
|
|
* node recover_zip.js [--verbose] <HISTORY_ID> <HISTORY_ID> ...
|
|
*
|
|
* Output:
|
|
* Signed URL(s) for the uploaded zip files. Note that these are valid for
|
|
* only 24h, to match the lifecycle rule on the zip bucket.
|
|
*/
|
|
|
|
const fs = require('node:fs')
|
|
const os = require('node:os')
|
|
const path = require('node:path')
|
|
const util = require('node:util')
|
|
const { pipeline } = require('node:stream/promises')
|
|
|
|
// Something is registering 11 listeners, over the limit
|
|
// of 10, which generates a lot of warning noise.
|
|
require('node:events').EventEmitter.defaultMaxListeners = 11
|
|
|
|
const config = require('config')
|
|
// We depend on this via object-persistor.
|
|
// eslint-disable-next-line import/no-extraneous-dependencies
|
|
const { Storage } = require('@google-cloud/storage')
|
|
const isValidUtf8 = require('utf-8-validate')
|
|
// zip-stream@7 uses ESM default export
|
|
const ZipStream = require('zip-stream').default
|
|
|
|
function createStorage() {
|
|
const opts = {}
|
|
if (config.has('persistor.gcs.endpoint.apiEndpoint')) {
|
|
opts.apiEndpoint = config.get('persistor.gcs.endpoint.apiEndpoint')
|
|
}
|
|
if (config.has('persistor.gcs.endpoint.projectId')) {
|
|
opts.projectId = config.get('persistor.gcs.endpoint.projectId')
|
|
}
|
|
return new Storage(opts)
|
|
}
|
|
|
|
const core = require('overleaf-editor-core')
|
|
const projectKey = require('@overleaf/object-persistor/src/ProjectKey.js')
|
|
const streams = require('../lib/streams')
|
|
|
|
const {
|
|
values: { verbose: VERBOSE },
|
|
positionals: HISTORY_IDS,
|
|
} = util.parseArgs({
|
|
options: {
|
|
verbose: {
|
|
type: 'boolean',
|
|
default: false,
|
|
},
|
|
},
|
|
allowPositionals: true,
|
|
})
|
|
|
|
if (HISTORY_IDS.length === 0) {
|
|
console.error('no history IDs; see usage')
|
|
process.exit(1)
|
|
}
|
|
|
|
async function listDeletedChunks(historyId) {
|
|
const bucketName = config.get('chunkStore.bucket')
|
|
const storage = createStorage()
|
|
const [files] = await storage.bucket(bucketName).getFiles({
|
|
prefix: projectKey.format(historyId),
|
|
versions: true,
|
|
})
|
|
return files
|
|
}
|
|
|
|
async function findLatestChunk(historyId) {
|
|
const files = await listDeletedChunks(historyId)
|
|
if (files.length === 0) return null
|
|
files.sort((a, b) => {
|
|
if (a.name < b.name) return -1
|
|
if (a.name > b.name) return 1
|
|
return 0
|
|
})
|
|
return files[files.length - 1]
|
|
}
|
|
|
|
async function downloadLatestChunk(tmp, historyId) {
|
|
const latestChunkFile = await findLatestChunk(historyId)
|
|
if (!latestChunkFile) throw new Error('no chunk found to recover')
|
|
|
|
const destination = path.join(tmp, 'latest.json')
|
|
await latestChunkFile.download({ destination })
|
|
return destination
|
|
}
|
|
|
|
async function loadHistory(historyPathname) {
|
|
const data = await fs.promises.readFile(historyPathname)
|
|
const rawHistory = JSON.parse(data)
|
|
return core.History.fromRaw(rawHistory)
|
|
}
|
|
|
|
async function loadChunk(historyPathname, blobStore) {
|
|
const history = await loadHistory(historyPathname)
|
|
|
|
const blobHashes = new Set()
|
|
history.findBlobHashes(blobHashes)
|
|
|
|
await blobStore.fetchBlobs(blobHashes)
|
|
await history.loadFiles('lazy', blobStore)
|
|
|
|
return new core.Chunk(history, 0)
|
|
}
|
|
|
|
// TODO: it would be nice to export / expose this from BlobStore;
|
|
// currently this is a copy of the method there.
|
|
async function getStringLengthOfFile(byteLength, pathname) {
|
|
// We have to read the file into memory to get its UTF-8 length, so don't
|
|
// bother for files that are too large for us to edit anyway.
|
|
if (byteLength > core.Blob.MAX_EDITABLE_BYTE_LENGTH_BOUND) {
|
|
return null
|
|
}
|
|
|
|
// We need to check if the file contains nonBmp or null characters
|
|
let data = await fs.promises.readFile(pathname)
|
|
if (!isValidUtf8(data)) return null
|
|
data = data.toString()
|
|
if (data.length > core.TextOperation.MAX_STRING_LENGTH) return null
|
|
if (core.util.containsNonBmpChars(data)) return null
|
|
if (data.indexOf('\x00') !== -1) return null
|
|
return data.length
|
|
}
|
|
|
|
class RecoveryBlobStore {
|
|
constructor(historyId, tmp) {
|
|
this.historyId = historyId
|
|
this.tmp = tmp
|
|
this.blobs = new Map()
|
|
}
|
|
|
|
async fetchBlobs(blobHashes) {
|
|
for await (const blobHash of blobHashes) {
|
|
await this.fetchBlob(blobHash)
|
|
}
|
|
}
|
|
|
|
async fetchBlob(hash) {
|
|
if (this.blobs.has(hash)) return
|
|
|
|
if (VERBOSE) console.log('fetching blob', hash)
|
|
|
|
const bucketName = config.get('blobStore.projectBucket')
|
|
const storage = createStorage()
|
|
const [files] = await storage.bucket(bucketName).getFiles({
|
|
prefix: this.makeProjectBlobKey(hash),
|
|
versions: true,
|
|
})
|
|
|
|
const destination = this.getBlobPathname(hash)
|
|
|
|
if (files.length === 0) {
|
|
await this.fetchGlobalBlob(hash, destination)
|
|
} else if (files.length === 1) {
|
|
await files[0].download({ destination })
|
|
} else {
|
|
throw new Error('Multiple versions of blob ' + hash)
|
|
}
|
|
|
|
this.blobs.set(hash, await this.makeBlob(hash, destination))
|
|
}
|
|
|
|
async fetchGlobalBlob(hash, destination) {
|
|
const bucketName = config.get('blobStore.globalBucket')
|
|
const storage = createStorage()
|
|
const file = storage.bucket(bucketName).file(this.makeGlobalBlobKey(hash))
|
|
await file.download({ destination })
|
|
}
|
|
|
|
async makeBlob(hash, pathname) {
|
|
const stat = await fs.promises.stat(pathname)
|
|
const byteLength = stat.size
|
|
const stringLength = await getStringLengthOfFile(byteLength, pathname)
|
|
return new core.Blob(hash, byteLength, stringLength)
|
|
}
|
|
|
|
async getString(hash) {
|
|
const stream = await this.getStream(hash)
|
|
const buffer = await streams.readStreamToBuffer(stream)
|
|
return buffer.toString()
|
|
}
|
|
|
|
async getStream(hash) {
|
|
return fs.createReadStream(this.getBlobPathname(hash))
|
|
}
|
|
|
|
async getBlob(hash) {
|
|
return this.blobs.get(hash)
|
|
}
|
|
|
|
getBlobPathname(hash) {
|
|
return path.join(this.tmp, hash)
|
|
}
|
|
|
|
makeGlobalBlobKey(hash) {
|
|
return `${hash.slice(0, 2)}/${hash.slice(2, 4)}/${hash.slice(4)}`
|
|
}
|
|
|
|
makeProjectBlobKey(hash) {
|
|
return `${projectKey.format(this.historyId)}/${hash.slice(
|
|
0,
|
|
2
|
|
)}/${hash.slice(2)}`
|
|
}
|
|
}
|
|
|
|
async function uploadZip(historyId, zipPathname) {
|
|
const bucketName = config.get('zipStore.bucket')
|
|
const deadline = 24 * 3600 * 1000 // lifecycle limit on the zips bucket
|
|
const storage = createStorage()
|
|
const destination = `${historyId}-recovered.zip`
|
|
await storage.bucket(bucketName).upload(zipPathname, {
|
|
destination,
|
|
resumable: false,
|
|
})
|
|
|
|
if (config.has('persistor.gcs.endpoint.apiEndpoint')) {
|
|
// In emulator mode, signed URLs aren't available
|
|
const apiEndpoint = config.get('persistor.gcs.endpoint.apiEndpoint')
|
|
return `${apiEndpoint}/storage/v1/b/${bucketName}/o/${encodeURIComponent(destination)}?alt=media`
|
|
}
|
|
|
|
const signedUrls = await storage
|
|
.bucket(bucketName)
|
|
.file(destination)
|
|
.getSignedUrl({
|
|
version: 'v4',
|
|
action: 'read',
|
|
expires: Date.now() + deadline,
|
|
})
|
|
|
|
return signedUrls[0]
|
|
}
|
|
|
|
/**
|
|
* Promisified wrapper for ZipStream's entry method.
|
|
*
|
|
* @param {ZipStream} archive
|
|
* @param {Buffer|NodeJS.ReadableStream|string} source
|
|
* @param {{ name: string }} data
|
|
* @return {Promise<void>}
|
|
*/
|
|
function addEntry(archive, source, data) {
|
|
return new Promise((resolve, reject) => {
|
|
archive.entry(source, data, err => {
|
|
if (err) reject(err)
|
|
else resolve()
|
|
})
|
|
})
|
|
}
|
|
|
|
async function restoreProject(historyId) {
|
|
const tmp = await fs.promises.mkdtemp(
|
|
path.join(os.tmpdir(), historyId.toString())
|
|
)
|
|
if (VERBOSE) console.log('recovering', historyId, 'in', tmp)
|
|
|
|
const latestJsonPathname = await downloadLatestChunk(tmp, historyId)
|
|
const blobStore = new RecoveryBlobStore(historyId, tmp)
|
|
const chunk = await loadChunk(latestJsonPathname, blobStore)
|
|
|
|
const snapshot = chunk.getSnapshot()
|
|
for (const change of chunk.getChanges()) {
|
|
change.applyTo(snapshot)
|
|
}
|
|
|
|
if (VERBOSE) console.log('zipping', historyId)
|
|
|
|
const zipPathname = path.join(tmp, `${historyId}.zip`)
|
|
const outputFile = fs.createWriteStream(zipPathname)
|
|
const archive = new ZipStream()
|
|
|
|
const pipelinePromise = pipeline(archive, outputFile)
|
|
|
|
for (const pathname of snapshot.getFilePathnames()) {
|
|
const file = snapshot.getFile(pathname)
|
|
if (!file) continue
|
|
|
|
await file.load('eager', blobStore)
|
|
let content = file.getContent({
|
|
filterTrackedDeletes: true,
|
|
})
|
|
|
|
if (content === null) {
|
|
const hash = file.getHash()
|
|
content = await blobStore.getStream(hash)
|
|
}
|
|
|
|
if (content == null) continue
|
|
|
|
if (typeof content === 'string') {
|
|
content = Buffer.from(content)
|
|
}
|
|
await addEntry(archive, content, { name: pathname })
|
|
if (VERBOSE) console.log(`${pathname} added`)
|
|
}
|
|
|
|
archive.finalize()
|
|
await pipelinePromise
|
|
|
|
if (VERBOSE) {
|
|
console.log(`Wrote ${archive.getBytesWritten()} bytes`)
|
|
}
|
|
|
|
if (VERBOSE) console.log('uploading', historyId)
|
|
|
|
return await uploadZip(historyId, zipPathname)
|
|
}
|
|
|
|
async function main() {
|
|
for (const historyId of HISTORY_IDS) {
|
|
const signedUrl = await restoreProject(historyId)
|
|
console.log(signedUrl)
|
|
}
|
|
}
|
|
main().catch(err => {
|
|
console.error(err)
|
|
process.exit(1)
|
|
})
|