Merge pull request #30045 from overleaf/bg-object-persistor-make-list-directory-safer

Improve safety of object persistor

GitOrigin-RevId: bced9814de6613b388ca288a5f72cd42cff6c1d3
This commit is contained in:
Brian Gough
2025-12-05 08:53:54 +00:00
committed by Copybot
parent 6f34fa869d
commit 7a57ef00cb
10 changed files with 550 additions and 40 deletions
@@ -177,4 +177,21 @@ module.exports = class AbstractPersistor {
prefix,
})
}
/**
* List objects in a directory, returning key and size information.
*
* Suitable only for directories where the number of keys is known to be small.
*
* @param {string} location
* @param {string} prefix
* @returns {Promise<Array<{key: string, size: number}>>}
*/
async listDirectoryStats(location, prefix) {
throw new NotImplementedError('method not implemented in persistor', {
method: 'listDirectoryStats',
location,
prefix,
})
}
}
+40 -1
View File
@@ -196,7 +196,46 @@ module.exports = class FSPersistor extends AbstractPersistor {
async listDirectoryKeys(location, name) {
const fsPath = this._getFsPath(location, name)
return await this._listDirectory(fsPath)
const paths = await this._listDirectory(fsPath)
// Filter to only return files, not directories
const files = []
for (const path of paths) {
try {
const stat = await fsPromises.stat(path)
if (stat.isFile()) {
files.push(path)
}
} catch (err) {
// ignore files that may have just been deleted
if (err.code !== 'ENOENT') {
throw err
}
}
}
return files
}
async listDirectoryStats(location, name) {
const fsPath = this._getFsPath(location, name)
const paths = await this._listDirectory(fsPath)
// Filter to only return files, not directories, with their sizes
const stats = []
for (const path of paths) {
try {
const stat = await fsPromises.stat(path)
if (stat.isFile()) {
stats.push({ key: path, size: stat.size })
}
} catch (err) {
// ignore files that may have just been deleted
if (err.code !== 'ENOENT') {
throw err
}
}
}
return stats
}
async checkIfObjectExists(location, name) {
@@ -322,6 +322,17 @@ module.exports = class GcsPersistor extends AbstractPersistor {
return files.map(file => file.name)
}
async listDirectoryStats(bucketName, prefix) {
const files = await this.#listDirectory(
bucketName,
ensurePrefixIsDirectory(prefix)
)
return files.map(file => ({
key: file.name,
size: parseInt(file.metadata.size, 10),
}))
}
async checkIfObjectExists(bucketName, key) {
try {
const [response] = await this.storage
@@ -417,10 +417,20 @@ class CachedPerProjectEncryptedS3Persistor {
*
* @param {string} bucketName
* @param {string} path
* @return {Promise<ListDirectoryResult>}
* @return {Promise<string[]>}
*/
async listDirectory(bucketName, path) {
return await this.#parent.listDirectory(bucketName, path)
async listDirectoryKeys(bucketName, path) {
return await this.#parent.listDirectoryKeys(bucketName, path)
}
/**
*
* @param {string} bucketName
* @param {string} path
* @return {Promise<Array<{key: string, size: number}>>}
*/
async listDirectoryStats(bucketName, path) {
return await this.#parent.listDirectoryStats(bucketName, path)
}
/**
+55 -2
View File
@@ -266,7 +266,7 @@ class S3Persistor extends AbstractPersistor {
* @return {Promise<void>}
*/
async deleteDirectory(bucketName, key, continuationToken) {
const { contents, response } = await this.listDirectory(
const { contents, response } = await this.#listDirectory(
bucketName,
key,
continuationToken
@@ -309,7 +309,7 @@ class S3Persistor extends AbstractPersistor {
* @param {string} [continuationToken]
* @return {Promise<ListDirectoryResult>}
*/
async listDirectory(bucketName, key, continuationToken) {
async #listDirectory(bucketName, key, continuationToken) {
let response
const options = { Bucket: bucketName, Prefix: key }
if (continuationToken) {
@@ -535,6 +535,59 @@ class S3Persistor extends AbstractPersistor {
}
}
/**
* @param {string} bucketName
* @param {string} prefix
* @return {Promise<Array<string>>}
*/
async listDirectoryKeys(bucketName, prefix) {
const keys = []
let continuationToken
do {
const { contents, response } = await this.#listDirectory(
bucketName,
prefix,
continuationToken
)
keys.push(...contents.map(item => item.Key || ''))
continuationToken = response.IsTruncated
? response.NextContinuationToken
: undefined
} while (continuationToken)
return keys
}
/**
* @param {string} bucketName
* @param {string} prefix
* @return {Promise<Array<{key: string, size: number}>>}
*/
async listDirectoryStats(bucketName, prefix) {
const stats = []
let continuationToken
do {
const { contents, response } = await this.#listDirectory(
bucketName,
prefix,
continuationToken
)
stats.push(
...contents.map(item => ({
key: item.Key || '',
size: item.Size ?? -1,
}))
)
continuationToken = response.IsTruncated
? response.NextContinuationToken
: undefined
} while (continuationToken)
return stats
}
/**
* @param {string} bucket
* @return {S3Client}
@@ -400,6 +400,56 @@ describe('FSPersistorTests', function () {
).to.equal(0)
})
})
describe('listDirectoryKeys', function () {
beforeEach(async function () {
for (const file of Object.values(files)) {
await persistor.sendFile(location, file, '/uploads/info.txt')
}
})
it('should list directory keys', async function () {
const keys = await persistor.listDirectoryKeys(location, 'animals')
expect(keys).to.have.lengthOf(2)
expect(keys).to.include(scenario.fsPath(files.wombat))
expect(keys).to.include(scenario.fsPath(files.giraffe))
})
it('should return empty array for non-existing directories', async function () {
const keys = await persistor.listDirectoryKeys(
location,
'does-not-exist'
)
expect(keys).to.deep.equal([])
})
})
describe('listDirectoryStats', function () {
beforeEach(async function () {
for (const file of Object.values(files)) {
await persistor.sendFile(location, file, '/uploads/info.txt')
}
})
it('should list directory stats', async function () {
const stats = await persistor.listDirectoryStats(location, 'animals')
expect(stats).to.have.lengthOf(2)
const keys = stats.map(s => s.key)
expect(keys).to.include(scenario.fsPath(files.wombat))
expect(keys).to.include(scenario.fsPath(files.giraffe))
for (const stat of stats) {
expect(stat.size).to.equal(localFiles['/uploads/info.txt'].length)
}
})
it('should return empty array for non-existing directories', async function () {
const stats = await persistor.listDirectoryStats(
location,
'does-not-exist'
)
expect(stats).to.deep.equal([])
})
})
})
}
})
@@ -723,4 +723,127 @@ describe('GcsPersistorTests', function () {
})
})
})
describe('listDirectoryKeys', function () {
describe('with valid parameters', function () {
let keys
beforeEach(async function () {
const filesWithNames = files.map((file, i) => ({
...file,
name: i === 0 ? 'llama' : 'hippo',
}))
GcsBucket.getFiles.resolves([filesWithNames])
keys = await GcsPersistor.listDirectoryKeys(bucket, key)
})
it('should list the objects in the directory', function () {
expect(Storage.prototype.bucket).to.have.been.calledWith(bucket)
expect(GcsBucket.getFiles).to.have.been.calledWith({
prefix: `${key}/`,
})
})
it('should return the keys', function () {
expect(keys).to.deep.equal(['llama', 'hippo'])
})
})
describe('when there are no files', function () {
let keys
beforeEach(async function () {
GcsBucket.getFiles.resolves([[]])
keys = await GcsPersistor.listDirectoryKeys(bucket, key)
})
it('should return an empty array', function () {
expect(keys).to.deep.equal([])
})
})
describe('when there is an error listing the objects', function () {
let error
beforeEach(async function () {
GcsBucket.getFiles.rejects(genericError)
try {
await GcsPersistor.listDirectoryKeys(bucket, key)
} catch (err) {
error = err
}
})
it('should generate a ReadError', function () {
expect(error).to.be.an.instanceOf(Errors.ReadError)
})
it('should wrap the error', function () {
expect(error.cause).to.equal(genericError)
})
})
})
describe('listDirectoryStats', function () {
describe('with valid parameters', function () {
let stats
beforeEach(async function () {
const filesWithNames = files.map((file, i) => ({
...file,
name: i === 0 ? 'llama' : 'hippo',
}))
GcsBucket.getFiles.resolves([filesWithNames])
stats = await GcsPersistor.listDirectoryStats(bucket, key)
})
it('should list the objects in the directory', function () {
expect(Storage.prototype.bucket).to.have.been.calledWith(bucket)
expect(GcsBucket.getFiles).to.have.been.calledWith({
prefix: `${key}/`,
})
})
it('should return the stats', function () {
expect(stats).to.deep.equal([
{ key: 'llama', size: 11 },
{ key: 'hippo', size: 22 },
])
})
})
describe('when there are no files', function () {
let stats
beforeEach(async function () {
GcsBucket.getFiles.resolves([[]])
stats = await GcsPersistor.listDirectoryStats(bucket, key)
})
it('should return an empty array', function () {
expect(stats).to.deep.equal([])
})
})
describe('when there is an error listing the objects', function () {
let error
beforeEach(async function () {
GcsBucket.getFiles.rejects(genericError)
try {
await GcsPersistor.listDirectoryStats(bucket, key)
} catch (err) {
error = err
}
})
it('should generate a ReadError', function () {
expect(error).to.be.an.instanceOf(Errors.ReadError)
})
it('should wrap the error', function () {
expect(error.cause).to.equal(genericError)
})
})
})
})
@@ -1046,4 +1046,215 @@ describe('S3PersistorTests', function () {
})
})
})
describe('listDirectoryKeys', function () {
describe('with valid parameters', function () {
let keys
beforeEach(async function () {
S3.mockSend(S3.ListObjectsV2Command, {
Contents: files.map(file => ({ Key: file.Key })),
IsTruncated: false,
})
keys = await S3Persistor.listDirectoryKeys(bucket, key)
})
it('should list the objects in the directory', function () {
S3.assertSendCalledWith(S3.ListObjectsV2Command, {
Bucket: bucket,
Prefix: key,
})
})
it('should return the keys', function () {
expect(keys).to.deep.equal(['llama', 'hippo'])
})
})
describe('when there are no files', function () {
let keys
beforeEach(async function () {
S3.mockSend(S3.ListObjectsV2Command, {
Contents: [],
IsTruncated: false,
})
keys = await S3Persistor.listDirectoryKeys(bucket, key)
})
it('should return an empty array', function () {
expect(keys).to.deep.equal([])
})
})
describe('when there are more files available', function () {
const continuationToken = 'wombat'
let keys
beforeEach(async function () {
S3.mockSend(
S3.ListObjectsV2Command,
{
Contents: [files[0]].map(file => ({ Key: file.Key })),
IsTruncated: true,
NextContinuationToken: continuationToken,
},
{
nextResponses: [
{
Contents: [files[1]].map(file => ({ Key: file.Key })),
IsTruncated: false,
},
],
}
)
keys = await S3Persistor.listDirectoryKeys(bucket, key)
})
it('should list the objects in multiple calls', function () {
S3.assertSendCallCount(S3.ListObjectsV2Command, 2)
expect(S3.s3ClientStub.send.firstCall.args[0].payload).to.deep.equal({
Bucket: bucket,
Prefix: key,
})
expect(S3.s3ClientStub.send.secondCall.args[0].payload).to.deep.equal({
Bucket: bucket,
Prefix: key,
ContinuationToken: continuationToken,
})
})
it('should return all keys', function () {
expect(keys).to.deep.equal(['llama', 'hippo'])
})
})
describe('when there is an error listing the objects', function () {
let error
beforeEach(async function () {
S3.mockSend(S3.ListObjectsV2Command, genericError, { rejects: true })
try {
await S3Persistor.listDirectoryKeys(bucket, key)
} catch (err) {
error = err
}
})
it('should generate a ReadError', function () {
expect(error).to.be.an.instanceOf(Errors.ReadError)
})
it('should wrap the error', function () {
expect(error.cause).to.exist
})
})
})
describe('listDirectoryStats', function () {
describe('with valid parameters', function () {
let stats
beforeEach(async function () {
S3.mockSend(S3.ListObjectsV2Command, {
Contents: files.map(file => ({ Key: file.Key, Size: file.Size })),
IsTruncated: false,
})
stats = await S3Persistor.listDirectoryStats(bucket, key)
})
it('should list the objects in the directory', function () {
S3.assertSendCalledWith(S3.ListObjectsV2Command, {
Bucket: bucket,
Prefix: key,
})
})
it('should return the stats', function () {
expect(stats).to.deep.equal([
{ key: 'llama', size: 11 },
{ key: 'hippo', size: 22 },
])
})
})
describe('when there are no files', function () {
let stats
beforeEach(async function () {
S3.mockSend(S3.ListObjectsV2Command, {
Contents: [],
IsTruncated: false,
})
stats = await S3Persistor.listDirectoryStats(bucket, key)
})
it('should return an empty array', function () {
expect(stats).to.deep.equal([])
})
})
describe('when there are more files available', function () {
const continuationToken = 'wombat'
let stats
beforeEach(async function () {
S3.mockSend(
S3.ListObjectsV2Command,
{
Contents: [files[0]].map(file => ({
Key: file.Key,
Size: file.Size,
})),
IsTruncated: true,
NextContinuationToken: continuationToken,
},
{
nextResponses: [
{
Contents: [files[1]].map(file => ({
Key: file.Key,
Size: file.Size,
})),
IsTruncated: false,
},
],
}
)
stats = await S3Persistor.listDirectoryStats(bucket, key)
})
it('should list the objects in multiple calls', function () {
S3.assertSendCallCount(S3.ListObjectsV2Command, 2)
})
it('should return all stats', function () {
expect(stats).to.deep.equal([
{ key: 'llama', size: 11 },
{ key: 'hippo', size: 22 },
])
})
})
describe('when there is an error listing the objects', function () {
let error
beforeEach(async function () {
S3.mockSend(S3.ListObjectsV2Command, genericError, { rejects: true })
try {
await S3Persistor.listDirectoryStats(bucket, key)
} catch (err) {
error = err
}
})
it('should generate a ReadError', function () {
expect(error).to.be.an.instanceOf(Errors.ReadError)
})
it('should wrap the error', function () {
expect(error.cause).to.exist
})
})
})
})
@@ -400,28 +400,22 @@ export async function archiveLatestChunk(
* @return {Promise<void>}
*/
async function addRawBlobsToArchive(historyId, archive, projectCache) {
const key = projectKey.format(historyId)
const { contents } = await projectCache.listDirectory(projectBlobsBucket, key)
for (const blobRecord of contents) {
if (!blobRecord.Key) {
logger.debug({ blobRecord }, 'no key')
continue
}
const blobKey = blobRecord.Key
const blobKeys = await projectCache.listDirectoryKeys(
projectBlobsBucket,
projectKey.format(historyId)
)
for (const key of blobKeys) {
try {
const stream = await projectCache.getObjectStream(
projectBlobsBucket,
blobKey,
key,
{ autoGunzip: true }
)
archive.append(stream, {
name: path.join(historyId, 'blobs', blobKey),
name: path.join(historyId, 'blobs', key),
})
} catch (err) {
logger.warn(
{ err, path: blobRecord.Key },
'Failed to append blob to archive'
)
logger.warn({ err, path: key }, 'Failed to append blob to archive')
}
}
}
@@ -445,24 +439,20 @@ export async function archiveRawProject(
) {
const projectCache = await getProjectPersistor(historyId)
const { contents: chunks } = await projectCache.listDirectory(
const chunkKeys = await projectCache.listDirectoryKeys(
chunksBucket,
projectKey.format(historyId)
)
if (chunks.length === 0) {
if (chunkKeys.length === 0) {
throw new Error('No chunks found')
}
for (const chunkRecord of chunks) {
if (!chunkRecord.Key) {
logger.debug({ chunkRecord }, 'no key')
continue
}
const chunkId = chunkRecord.Key.split('/').pop()
logger.debug({ chunkId, key: chunkRecord.Key }, 'Processing chunk')
for (const key of chunkKeys) {
const chunkId = key.split('/').pop()
logger.debug({ chunkId, key }, 'Processing chunk')
const { buffer } = await loadChunkByKey(projectCache, chunkRecord.Key)
const { buffer } = await loadChunkByKey(projectCache, key)
archive.append(buffer, {
name: `${historyId}/chunks/${chunkId}/chunk.json`,
+16 -10
View File
@@ -900,6 +900,11 @@ class BlobComparator {
const SHA1_HEX_REGEX = /^[a-f0-9]{40}$/
/**
* Get a listing of all blobs for a project
* @param {string} historyId - The history ID
* @returns {Promise<Map<string, {key: string, size: number}>>} Map of blob hash to blob metadata
*/
async function getBlobListing(historyId) {
const backupPersistorForProject = await backupPersistor.forProject(
projectBlobsBucket,
@@ -909,7 +914,7 @@ async function getBlobListing(historyId) {
// get the blob listing
const projectBlobsPath = projectKey.format(historyId)
const { contents: blobList } = await backupPersistorForProject.listDirectory(
const blobList = await backupPersistorForProject.listDirectoryStats(
projectBlobsBucket,
projectBlobsPath
)
@@ -918,21 +923,22 @@ async function getBlobListing(historyId) {
return new Map()
}
/** @type {Map<string, {key: string, size: number}>} */
const remoteBlobs = new Map()
for (const blobRecord of blobList) {
if (!blobRecord.Key) {
logger.debug({ blobRecord }, 'no key')
if (!blobRecord.key || typeof blobRecord.size !== 'number') {
logger.debug({ blobRecord }, 'invalid blob record')
continue
}
const parts = blobRecord.Key.split('/')
const parts = blobRecord.key.split('/')
const hash = parts[3] + parts[4]
if (!SHA1_HEX_REGEX.test(hash)) {
console.warn(`Invalid SHA1 hash for project ${historyId}: ${hash}`)
continue
}
remoteBlobs.set(hash, blobRecord)
remoteBlobs.set(hash, { key: blobRecord.key, size: blobRecord.size })
}
return remoteBlobs
}
@@ -1075,26 +1081,26 @@ async function compareBackups(projectId, options, log = console.log) {
const blobListEntry = blobsFromListing.get(blob.hash)
if (options.fast) {
if (blobListEntry) {
if (blob.byteLength === blobListEntry.Size) {
if (blob.byteLength === blobListEntry.size) {
// Size matches exactly
log(
` ✓ Blob ${blob.hash} exists on remote with expected size (${blob.byteLength} bytes)`
)
totalBlobMatches++
continue
} else if (blob.stringLength > 0 && blobListEntry.Size > 0) {
} else if (blob.stringLength > 0 && blobListEntry.size > 0) {
// Text file present with compressed size, assume valid as we are in --fast comparison mode
const compressionRatio = (
blobListEntry.Size / blob.byteLength
blobListEntry.size / blob.byteLength
).toFixed(2)
log(
` ✓ Blob ${blob.hash} consistent with compressed data on remote (${blob.byteLength} bytes => ${blobListEntry.Size} bytes, ratio=${compressionRatio})`
` ✓ Blob ${blob.hash} consistent with compressed data on remote (${blob.byteLength} bytes => ${blobListEntry.size} bytes, ratio=${compressionRatio})`
)
totalBlobMatches++
continue
} else {
log(
` ✗ Blob ${blob.hash} size mismatch (original: ${blob.byteLength} bytes, stringLength: ${blob.stringLength}, backup: ${blobListEntry.Size} bytes)`
` ✗ Blob ${blob.hash} size mismatch (original: ${blob.byteLength} bytes, stringLength: ${blob.stringLength}, backup: ${blobListEntry.size} bytes)`
)
totalBlobMismatches++
errors.push({