Given a list of documents with id and content, return a HashMap with key as the word and value as a list of document ids (an inverted index).
Example
Input:
[
{
"id": 1,
"content": "This is the content of document 1 it is very short"
},
{
"id": 2,
"content": "This is the content of document 2 it is very long bilabial bilabial heheh hahaha ..."
},
]
Output:
{
"This": [1, 2],
"is": [1, 2],
...
}
Code
public Map<String, List<Integer>> invertedIndex(List<Document> docs) {
Map<String, List<Integer>> result = new HashMap<>();
if (docs == null || docs.size() == 0) {
return result;
}
Map<String, Set<Integer>> index = new HashMap<>();
for (Document doc : docs) {
int curId = doc.id;
for (String term : doc.content.split(" ")) {
if (term.length() == 0) continue;
if (!index.containsKey(term)) {
index.put(term, new LinkedHashSet<>());
}
index.get(term).add(curId);
}
}
for (String key : index.keySet()) {
result.put(key, new ArrayList<>(index.get(key)));
}
return result;
}