% Conference paper in the ICACSIS 2015 proceedings. The year/month/day fields
% record 19 Feb 2016 (presumably the proceedings publication date -- confirm);
% the conference dates themselves are given in the note field.
@inproceedings{a7dfbf079c094b088aeee24ace2041e0,
  author    = {Utama, Prasetya Ajie and Distiawan, Bayu},
  title     = {{Spark-gram}: Mining frequent {N-grams} using parallel processing in {Spark}},
  booktitle = {ICACSIS 2015 - 2015 International Conference on Advanced Computer Science and Information Systems, Proceedings},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  pages     = {129--136},
  year      = {2016},
  month     = feb,
  day       = {19},
  doi       = {10.1109/ICACSIS.2015.7415169},
  language  = {English},
  keywords  = {distributed computing, hadoop, mapreduce, spark, Text mining},
  note      = {Publisher Copyright: {\textcopyright} 2015 IEEE.; International Conference on Advanced Computer Science and Information Systems, ICACSIS 2015 ; Conference date: 10-10-2015 Through 11-10-2015},
  abstract  = {Mining sequence patterns in form of n-grams (sequences of words that appear consecutively) from a large text data is one of the fundamental parts in several information retrieval and natural language processing applications. In this work, we present Spark-gram, a method for large scale frequent sequence mining based on Spark that was adapted from its equivalent method in MapReduce called Suffix-σ. Spark-gram design allows the discovery of all n-grams with maximum length σ and minimum occurrence frequency τ, using iterative algorithm with only a single shuffle phase. We show that Spark-gram can outperform Suffix-σ mainly when τ is high but potentially worse when the value of σ grows higher.},
}