updated API Manager to call aspell in batches (#30)

This commit is contained in:
Miguel Serrano
2019-07-11 12:29:00 +02:00
committed by GitHub
parent a28ecc0c39
commit 74c9a3c095
9 changed files with 272 additions and 1148 deletions
+11 -3
View File
@@ -7,13 +7,13 @@
* DS207: Consider shorter variations of null checks
* Full docs: https://github.com/decaffeinate/decaffeinate/blob/master/docs/suggestions.md
*/
let ASpell
const ASpellWorkerPool = require('./ASpellWorkerPool')
const LRU = require('lru-cache')
const logger = require('logger-sharelatex')
const fs = require('fs')
const settings = require('settings-sharelatex')
const Path = require('path')
const { promisify } = require('util')
const OneMinute = 60 * 1000
const opts = { max: 10000, maxAge: OneMinute * 60 * 10 }
@@ -164,7 +164,7 @@ class ASpellRunner {
}
}
module.exports = ASpell = {
const ASpell = {
// The description of how to call aspell from another program can be found here:
// http://aspell.net/man-html/Through-A-Pipe.html
checkWords(language, words, callback) {
@@ -174,7 +174,15 @@ module.exports = ASpell = {
const runner = new ASpellRunner()
return runner.checkWords(language, words, callback)
},
ASPELL_TIMEOUT: 6000
ASPELL_TIMEOUT: 10000
}
const promises = {
checkWords: promisify(ASpell.checkWords)
}
ASpell.promises = promises
module.exports = ASpell
var WorkerPool = new ASpellWorkerPool()
+5 -5
View File
@@ -11,7 +11,7 @@ const childProcess = require('child_process')
const logger = require('logger-sharelatex')
const metrics = require('metrics-sharelatex')
const _ = require('underscore')
const errorType = require('overleaf-error-type')
const OError = require('@overleaf/o-error')
const BATCH_SIZE = 100
@@ -49,7 +49,7 @@ class ASpellWorker {
this.state = 'closed'
}
if (this.callback != null) {
const err = new errorType.Error({
const err = new OError({
message: 'aspell worker closed output streams with uncalled callback',
info: {
process: this.pipe.pid,
@@ -82,7 +82,7 @@ class ASpellWorker {
if (this.callback != null) {
this.callback(
new errorType.Error({
new OError({
message: 'aspell worker error',
info: errInfo
}).withCause(err),
@@ -110,7 +110,7 @@ class ASpellWorker {
if (this.callback != null) {
this.callback(
new errorType.Error({
new OError({
message: 'aspell worker error on stdin',
info: errInfo
}).withCause(err),
@@ -174,7 +174,7 @@ class ASpellWorker {
if (this.callback != null) {
// only allow one callback in use
return this.callback(
new errorType.Error({
new OError({
message: 'Aspell callback already in use - SHOULD NOT HAPPEN',
info: {
process: this.pipe.pid,
@@ -1,12 +1,3 @@
// TODO: This file was created by bulk-decaffeinate.
// Sanity-check the conversion and remove this comment.
/*
* decaffeinate suggestions:
* DS102: Remove unnecessary code created because of implicit returns
* DS103: Rewrite code to no longer use __guard__
* DS207: Consider shorter variations of null checks
* Full docs: https://github.com/decaffeinate/decaffeinate/blob/master/docs/suggestions.md
*/
const request = require('request')
const logger = require('logger-sharelatex')
const settings = require('settings-sharelatex')
@@ -25,26 +16,21 @@ module.exports = {
if (err != null) {
return res.sendStatus(500)
}
const numberOfSuggestions = __guard__(
__guard__(
__guard__(body != null ? body.misspellings : undefined, x2 => x2[0]),
x1 => x1.suggestions
),
x => x.length
)
const misspellings =
body && body.misspellings ? body.misspellings[0] : undefined
const numberOfSuggestions =
misspellings && misspellings.suggestions
? misspellings.suggestions.length
: 0
if (numberOfSuggestions > 10) {
logger.log('health check passed')
return res.sendStatus(200)
res.sendStatus(200)
} else {
logger.err({ body, numberOfSuggestions }, 'health check failed')
return res.sendStatus(500)
res.sendStatus(500)
}
})
}
}
function __guard__(value, transform) {
return typeof value !== 'undefined' && value !== null
? transform(value)
: undefined
}
+17 -13
View File
@@ -1,18 +1,10 @@
// TODO: This file was created by bulk-decaffeinate.
// Sanity-check the conversion and remove this comment.
/*
* decaffeinate suggestions:
* DS102: Remove unnecessary code created because of implicit returns
* DS207: Consider shorter variations of null checks
* Full docs: https://github.com/decaffeinate/decaffeinate/blob/master/docs/suggestions.md
*/
let LearnedWordsManager
const db = require('./DB')
const mongoCache = require('./MongoCache')
const logger = require('logger-sharelatex')
const metrics = require('metrics-sharelatex')
const { promisify } = require('util')
module.exports = LearnedWordsManager = {
const LearnedWordsManager = {
learnWord(userToken, word, callback) {
if (callback == null) {
callback = () => {}
@@ -45,7 +37,7 @@ module.exports = LearnedWordsManager = {
metrics.inc('mongoCache', 0.1, { status: 'miss' })
logger.info({ userToken }, 'mongoCache miss')
return db.spellingPreferences.findOne({ token: userToken }, function(
db.spellingPreferences.findOne({ token: userToken }, function(
error,
preferences
) {
@@ -55,7 +47,7 @@ module.exports = LearnedWordsManager = {
const words =
(preferences != null ? preferences.learnedWords : undefined) || []
mongoCache.set(userToken, words)
return callback(null, words)
callback(null, words)
})
},
@@ -63,9 +55,21 @@ module.exports = LearnedWordsManager = {
if (callback == null) {
callback = () => {}
}
return db.spellingPreferences.remove({ token: userToken }, callback)
db.spellingPreferences.remove({ token: userToken }, callback)
}
}
const promises = {
learnWord: promisify(LearnedWordsManager.learnWord),
getLearnedWords: promisify(LearnedWordsManager.getLearnedWords),
deleteUsersLearnedWords: promisify(
LearnedWordsManager.deleteUsersLearnedWords
)
}
LearnedWordsManager.promises = promises
module.exports = LearnedWordsManager
;['learnWord', 'getLearnedWords'].map(method =>
metrics.timeAsyncMethod(
LearnedWordsManager,
@@ -1,111 +1,78 @@
// TODO: This file was created by bulk-decaffeinate.
// Sanity-check the conversion and remove this comment.
/*
* decaffeinate suggestions:
* DS102: Remove unnecessary code created because of implicit returns
* DS103: Rewrite code to no longer use __guard__
* DS207: Consider shorter variations of null checks
* Full docs: https://github.com/decaffeinate/decaffeinate/blob/master/docs/suggestions.md
*/
const SpellingAPIManager = require('./SpellingAPIManager')
const logger = require('logger-sharelatex')
const metrics = require('metrics-sharelatex')
function extractCheckRequestData(req) {
const token = req.params ? req.params.user_id : undefined
const wordCount =
req.body && req.body.words ? req.body.words.length : undefined
return { token, wordCount }
}
function extractLearnRequestData(req) {
const token = req.params ? req.params.user_id : undefined
const word = req.body ? req.body.word : undefined
return { token, word }
}
module.exports = {
check(req, res, next) {
check(req, res) {
metrics.inc('spelling-check', 0.1)
logger.info(
{
token: __guard__(req != null ? req.params : undefined, x => x.user_id),
word_count: __guard__(
__guard__(req != null ? req.body : undefined, x2 => x2.words),
x1 => x1.length
)
},
'running check'
)
return SpellingAPIManager.runRequest(req.params.user_id, req.body, function(
error,
result
) {
const { token, wordCount } = extractCheckRequestData(req)
logger.info({ token, wordCount }, 'running check')
SpellingAPIManager.runRequest(token, req.body, function(error, result) {
if (error != null) {
logger.err(
{
err: error,
user_id: __guard__(
req != null ? req.params : undefined,
x3 => x3.user_id
),
word_count: __guard__(
__guard__(req != null ? req.body : undefined, x5 => x5.words),
x4 => x4.length
)
user_id: token,
wordCount
},
'error processing spelling request'
)
return res.sendStatus(500)
}
return res.send(result)
res.send(result)
})
},
learn(req, res, next) {
metrics.inc('spelling-learn', 0.1)
logger.info(
{
token: __guard__(req != null ? req.params : undefined, x => x.user_id),
word: __guard__(req != null ? req.body : undefined, x1 => x1.word)
},
'learning word'
)
return SpellingAPIManager.learnWord(req.params.user_id, req.body, function(
error,
result
) {
const { token, word } = extractLearnRequestData(req)
logger.info({ token, word }, 'learning word')
SpellingAPIManager.learnWord(token, req.body, function(error) {
if (error != null) {
return next(error)
}
res.sendStatus(200)
return next()
next()
})
},
deleteDic(req, res, next) {
logger.log(
{
token: __guard__(req != null ? req.params : undefined, x => x.user_id),
word: __guard__(req != null ? req.body : undefined, x1 => x1.word)
},
'deleting user dictionary'
)
return SpellingAPIManager.deleteDic(req.params.user_id, function(error) {
const { token, word } = extractLearnRequestData(req)
logger.log({ token, word }, 'deleting user dictionary')
SpellingAPIManager.deleteDic(token, function(error) {
if (error != null) {
return next(error)
}
return res.sendStatus(204)
res.sendStatus(204)
})
},
getDic(req, res, next) {
const token = req.params ? req.params.user_id : undefined
logger.info(
{
token: __guard__(req != null ? req.params : undefined, x => x.user_id)
token
},
'getting user dictionary'
)
return SpellingAPIManager.getDic(req.params.user_id, function(
error,
words
) {
SpellingAPIManager.getDic(token, function(error, words) {
if (error != null) {
return next(error)
}
return res.send(words)
res.send(words)
})
}
}
function __guard__(value, transform) {
return typeof value !== 'undefined' && value !== null
? transform(value)
: undefined
}
+56 -46
View File
@@ -6,57 +6,20 @@
* DS207: Consider shorter variations of null checks
* Full docs: https://github.com/decaffeinate/decaffeinate/blob/master/docs/suggestions.md
*/
let SpellingAPIManager
const ASpell = require('./ASpell')
const LearnedWordsManager = require('./LearnedWordsManager')
const { callbackify } = require('util')
const _ = require('underscore')
module.exports = SpellingAPIManager = {
// The max number of words checked in a single request
const REQUEST_LIMIT = 10000
// Size of each chunk of words sent to Aspell checker
const CHUNK_SIZE = 1000
const SpellingAPIManager = {
whitelist: ['ShareLaTeX', 'sharelatex', 'LaTeX', 'http', 'https', 'www'],
runRequest(token, request, callback) {
if (callback == null) {
callback = () => {}
}
if (request.words == null) {
return callback(new Error('malformed JSON'))
}
const lang = request.language || 'en'
const check = (words, callback) =>
ASpell.checkWords(lang, words, (error, misspellings) =>
callback(error, { misspellings })
)
const wordsToCheck = request.words || []
if (token != null) {
return LearnedWordsManager.getLearnedWords(token, function(
error,
learnedWords
) {
if (error != null) {
return callback(error)
}
const words = wordsToCheck.slice(0, 10000)
return check(words, function(error, result) {
if (error != null) {
return callback(error)
}
result.misspellings = result.misspellings.filter(function(m) {
const word = words[m.index]
return (
learnedWords.indexOf(word) === -1 &&
SpellingAPIManager.whitelist.indexOf(word) === -1
)
})
return callback(error, result)
})
})
} else {
return check(wordsToCheck, callback)
}
},
learnWord(token, request, callback) {
if (callback == null) {
callback = () => {}
@@ -79,3 +42,50 @@ module.exports = SpellingAPIManager = {
return LearnedWordsManager.getLearnedWords(token, callback)
}
}
const promises = {
async runRequest(token, request) {
if (!request.words) {
throw new Error('malformed JSON')
}
const lang = request.language || 'en'
// only the first 10K words are checked
const wordSlice = request.words.slice(0, REQUEST_LIMIT)
// word list is splitted into chunks and sent to Aspell concurrently
const chunks = _.chunk(wordSlice, CHUNK_SIZE)
const chunkResults = await Promise.all(
chunks.map(chunk => ASpell.promises.checkWords(lang, chunk))
)
// indexes have to be reconstructed for each chunk
chunkResults.forEach((result, chunkIndex) =>
result.forEach(msp => (msp.index += chunkIndex * CHUNK_SIZE))
)
// the final misspellings object is created merging all the chunks
const misspellings = _.flatten(chunkResults)
if (token) {
const learnedWords = await LearnedWordsManager.promises.getLearnedWords(
token
)
const notLearntMisspellings = misspellings.filter(m => {
const word = wordSlice[m.index]
return (
learnedWords.indexOf(word) === -1 &&
SpellingAPIManager.whitelist.indexOf(word) === -1
)
})
return { misspellings: notLearntMisspellings }
} else {
return { misspellings }
}
}
}
SpellingAPIManager.runRequest = callbackify(promises.runRequest)
SpellingAPIManager.promises = promises
module.exports = SpellingAPIManager
+27 -912
View File
File diff suppressed because it is too large Load Diff
+3 -3
View File
@@ -24,6 +24,7 @@
},
"version": "0.1.4",
"dependencies": {
"@overleaf/o-error": "^2.0.0",
"async": "0.1.22",
"body-parser": "^1.12.0",
"coffee-script": "^1.9.1",
@@ -34,10 +35,9 @@
"mongojs": "2.4.0",
"node-statsd": "0.0.3",
"redis": "~0.8.4",
"request": "^2.53.0",
"request": "^2.88.0",
"settings-sharelatex": "^1.1.0",
"underscore": "1.4.4",
"overleaf-error-type": "git+https://github.com/overleaf/overleaf-error-type.git"
"underscore": "1.9.1"
},
"devDependencies": {
"bunyan": "^1.0.0",
@@ -1,14 +1,6 @@
/* eslint-disable
handle-callback-err,
no-undef
handle-callback-err
*/
// TODO: This file was created by bulk-decaffeinate.
// Sanity-check the conversion and remove this comment.
/*
* decaffeinate suggestions:
* DS102: Remove unnecessary code created because of implicit returns
* Full docs: https://github.com/decaffeinate/decaffeinate/blob/master/docs/suggestions.md
*/
const sinon = require('sinon')
const chai = require('chai')
const { expect } = chai
@@ -19,6 +11,8 @@ const modulePath = require('path').join(
'../../../app/js/SpellingAPIManager'
)
const promiseStub = val => new Promise(resolve => resolve(val))
describe('SpellingAPIManager', function() {
beforeEach(function() {
this.token = 'user-id-123'
@@ -26,15 +20,18 @@ describe('SpellingAPIManager', function() {
this.learnedWords = ['lerned']
this.LearnedWordsManager = {
getLearnedWords: sinon.stub().callsArgWith(1, null, this.learnedWords),
learnWord: sinon.stub().callsArg(2)
learnWord: sinon.stub().callsArg(2),
promises: {
getLearnedWords: sinon.stub().returns(promiseStub(this.learnedWords))
}
}
return (this.SpellingAPIManager = SandboxedModule.require(modulePath, {
this.SpellingAPIManager = SandboxedModule.require(modulePath, {
requires: {
'./ASpell': this.ASpell,
'./LearnedWordsManager': this.LearnedWordsManager
}
}))
})
})
describe('runRequest', function() {
@@ -58,25 +55,28 @@ describe('SpellingAPIManager', function() {
this.misspellingsWithoutLearnedWords = this.misspellings.slice(0, 3)
this.ASpell.checkWords = (lang, word, callback) => {
return callback(null, this.misspellings)
callback(null, this.misspellings)
}
return sinon.spy(this.ASpell, 'checkWords')
this.ASpell.promises = {
checkWords: sinon.stub().returns(promiseStub(this.misspellings))
}
sinon.spy(this.ASpell, 'checkWords')
})
describe('with sensible JSON', function() {
beforeEach(function(done) {
return this.SpellingAPIManager.runRequest(
this.SpellingAPIManager.runRequest(
this.token,
{ words: this.allWords },
(error, result) => {
this.result = result
return done()
done()
}
)
})
return it('should return the words that are spelled incorrectly and not learned', function() {
return expect(this.result.misspellings).to.deep.equal(
it('should return the words that are spelled incorrectly and not learned', function() {
expect(this.result.misspellings).to.deep.equal(
this.misspellingsWithoutLearnedWords
)
})
@@ -84,175 +84,209 @@ describe('SpellingAPIManager', function() {
describe('with a missing words array', function() {
beforeEach(function(done) {
return this.SpellingAPIManager.runRequest(
this.token,
{},
(error, result) => {
this.error = error
this.result = result
return done()
}
)
this.SpellingAPIManager.runRequest(this.token, {}, (error, result) => {
this.error = error
this.result = result
done()
})
})
return it('should return an error', function() {
it('should return an error', function() {
expect(this.error).to.exist
expect(this.error).to.be.instanceof(Error)
return expect(this.error.message).to.equal('malformed JSON')
expect(this.error.message).to.equal('malformed JSON')
})
})
describe('with a missing token', function() {
beforeEach(function(done) {
return this.SpellingAPIManager.runRequest(
this.SpellingAPIManager.runRequest(
null,
{ words: this.allWords },
(error, result) => {
this.error = error
this.result = result
return done()
done()
}
)
})
return it('should spell check without using any learned words', function() {
return this.LearnedWordsManager.getLearnedWords.called.should.equal(
false
)
it('should spell check without using any learned words', function() {
this.LearnedWordsManager.getLearnedWords.called.should.equal(false)
})
})
describe('without a language', function() {
beforeEach(function(done) {
return this.SpellingAPIManager.runRequest(
this.SpellingAPIManager.runRequest(
this.token,
{ words: this.allWords },
(error, result) => {
this.result = result
return done()
done()
}
)
})
return it('should use en as the default', function() {
return this.ASpell.checkWords.calledWith('en').should.equal(true)
it('should use en as the default', function() {
this.ASpell.promises.checkWords.calledWith('en').should.equal(true)
})
})
describe('with a language', function() {
beforeEach(function(done) {
return this.SpellingAPIManager.runRequest(
this.language = 'fr'
this.SpellingAPIManager.runRequest(
this.token,
{
words: this.allWords,
language: (this.language = 'fr')
language: this.language
},
(error, result) => {
this.result = result
return done()
done()
}
)
})
return it('should use the language', function() {
return this.ASpell.checkWords
it('should use the language', function() {
this.ASpell.promises.checkWords
.calledWith(this.language)
.should.equal(true)
})
})
describe('with a very large collection of words', function() {
describe('with a large collection of words', function() {
beforeEach(function(done) {
this.manyWords = __range__(1, 100000, true).map(i => 'word')
return this.SpellingAPIManager.runRequest(
this.ASpell.promises.checkWords = sinon.spy(() =>
promiseStub(this.misspellings)
)
this.manyWords = __range__(1, 4500, true).map(i => 'word')
this.SpellingAPIManager.runRequest(
this.token,
{ words: this.manyWords },
(error, result) => {
this.result = result
return done()
done()
}
)
})
return it('should truncate to 10,000 words', function() {
return this.ASpell.checkWords
.calledWith(sinon.match.any, this.manyWords.slice(0, 10000))
.should.equal(true)
it('should make concurrent requests of 1000 words', function() {
this.ASpell.promises.checkWords.callCount.should.equal(5)
})
})
return describe('with words from the whitelist', function() {
describe('with a collection of words with the max size of a chunk', function() {
beforeEach(function(done) {
this.ASpell.promises.checkWords = sinon.spy(() =>
promiseStub(this.misspellings)
)
this.manyWords = __range__(1, 1000, true).map(i => 'word')
this.SpellingAPIManager.runRequest(
this.token,
{ words: this.manyWords },
(error, result) => {
this.result = result
done()
}
)
})
it('should make a single request', function() {
this.ASpell.promises.checkWords.callCount.should.equal(1)
})
})
describe('with a very large collection of words', function() {
beforeEach(function(done) {
this.ASpell.promises.checkWords = sinon.spy(() =>
promiseStub(this.misspellings)
)
this.manyWords = __range__(1, 50000, true).map(i => 'word')
this.SpellingAPIManager.runRequest(
this.token,
{ words: this.manyWords },
(error, result) => {
this.result = result
done()
}
)
})
it('should make a maximum of 10 concurrent requests', function() {
this.ASpell.promises.checkWords.callCount.should.equal(10)
})
})
describe('with words from the whitelist', function() {
beforeEach(function(done) {
this.whitelistWord = this.SpellingAPIManager.whitelist[0]
this.words = ['One', 'Two', this.whitelistWord]
return this.SpellingAPIManager.runRequest(
this.SpellingAPIManager.runRequest(
this.token,
{ words: this.words },
(error, result) => {
this.result = result
return done()
done()
}
)
})
return it('should ignore the white-listed word', function() {
return expect(this.result.misspellings.length).to.equal(
it('should ignore the white-listed word', function() {
expect(this.result.misspellings.length).to.equal(
this.misspellings.length - 1
)
})
})
})
return describe('learnWord', function() {
describe('learnWord', function() {
describe('without a token', function() {
beforeEach(function(done) {
return this.SpellingAPIManager.learnWord(
null,
{ word: 'banana' },
error => {
this.error = error
return done()
}
)
this.SpellingAPIManager.learnWord(null, { word: 'banana' }, error => {
this.error = error
done()
})
})
return it('should return an error', function() {
it('should return an error', function() {
expect(this.error).to.exist
expect(this.error).to.be.instanceof(Error)
return expect(this.error.message).to.equal('no token provided')
expect(this.error.message).to.equal('no token provided')
})
})
describe('without a word', function() {
beforeEach(function(done) {
return this.SpellingAPIManager.learnWord(this.token, {}, error => {
this.SpellingAPIManager.learnWord(this.token, {}, error => {
this.error = error
return done()
done()
})
})
return it('should return an error', function() {
it('should return an error', function() {
expect(this.error).to.exist
expect(this.error).to.be.instanceof(Error)
return expect(this.error.message).to.equal('malformed JSON')
expect(this.error.message).to.equal('malformed JSON')
})
})
return describe('with a word and a token', function() {
describe('with a word and a token', function() {
beforeEach(function(done) {
this.word = 'banana'
return this.SpellingAPIManager.learnWord(
this.SpellingAPIManager.learnWord(
this.token,
{ word: this.word },
error => {
this.error = error
return done()
done()
}
)
})
return it('should call LearnedWordsManager.learnWord', function() {
return this.LearnedWordsManager.learnWord
it('should call LearnedWordsManager.learnWord', function() {
this.LearnedWordsManager.learnWord
.calledWith(this.token, this.word)
.should.equal(true)
})