% Conference paper in the CSIE 2009 proceedings (Elhadi and Al-Tobi).
% The proceedings title is stored in booktitle only; the conference dates are
% kept in note. Acronyms are brace-protected so sentence-casing styles keep them.
@inproceedings{cef7be8f0b8946f88f70d8431b3a5df0,
  author    = {Elhadi, Mohamed and Al-Tobi, Amjad},
  title     = {Webpage duplicate detection using combined {POS} and sequence alignment algorithm},
  booktitle = {2009 {WRI} World Congress on Computer Science and Information Engineering, {CSIE} 2009},
  year      = {2009},
  pages     = {630--634},
  doi       = {10.1109/CSIE.2009.771},
  isbn      = {9780769535074},
  language  = {English},
  keywords  = {Copy detection, Duplicate, LCS, Longest common sequence, POS, Part-of-speech, Search engine},
  abstract  = {Combined syntactical categories and sequence alignment algorithms are implemented and used to weed-out duplicate and near-duplicate web-pages from search engine results. The syntactical structures manifested as POS-tags were pre-processed using a POS tagger converting parts of a webpage's text into a string of tags. The produced string was then subjected into the longest Common Sequence (LCS) techniques (as is commonly done in computational biology), to detect duplicate and near-duplicate webpages. The process of tagging and aligning was based on set of sentences extracted from the web page as a representative of the pages. The query-keywords are used as a basis for sentence extraction. Results obtained from experiments performed have shown that such a combined approach can provide very interesting similarity calculation and re-ranking measure. This can be used with reasonable efficiency to detect duplications on search results generated by search engines such as Google. Similarity measurements obtained can be further used as a basis for text analysis of the search results allowing the detection of duplicate and near duplicates and clustering of documents in general.},
  note      = {Conference date: 31 March--2 April 2009},
}