1
Fork 0
mirror of https://github.com/pds-nest/nest.git synced 2024-11-25 22:44:19 +00:00
pds-2021-g2-nest/nest_frontend/utils/tokenizeTweetWords.js

35 lines
883 B
JavaScript
Raw Normal View History

2021-05-20 09:39:40 +00:00
import sw from "stopword"
// Words that the tokenizer below never counts: the `it` and `en` lists of the `stopword` package
// (presumably the Italian and English stop words — confirm against the package docs), plus "rt"
// (presumably the retweet marker).
const stopwords = [...sw.it, ...sw.en, "rt"]
/**
 * Count how many times each word appears in the content of the given tweets.
 *
 * The content of every tweet is lowercased and split on whitespace. Empty tokens, tokens starting with
 * `https://` and tokens contained in the module-level `stopwords` list are not counted.
 *
 * @param {Array<{content?: string}>} tweets - The tweets to tokenize. Tweets with a falsy `content` are skipped.
 * @returns {Array<{text: string, value: number}>} One entry per distinct word, `value` being its number of
 * occurrences across all tweets.
 */
export default function tokenizeTweetWords(tweets = []) {
    // Build the lookup set once per call, so that each token is checked without scanning the whole list.
    const ignoredWords = new Set(stopwords)

    // A prototype-less object keeps the key ordering of a plain object, while allowing tokens such as
    // "__proto__" to be counted like any other word.
    const occurrences = Object.create(null)

    for(const tweet of tweets) {
        if(!tweet.content) {
            continue
        }

        for(const word of tweet.content.toLowerCase().split(/\s+/)) {
            // Leading or trailing whitespace makes split() return empty strings, which are not words.
            if(word === "") continue
            if(word.startsWith("https://")) continue
            if(ignoredWords.has(word)) continue

            occurrences[word] = (occurrences[word] || 0) + 1
        }
    }

    return Object.entries(occurrences).map(([text, value]) => ({text, value}))
}