% Conference paper (CSASE); cleaned-up database export.
% The year is taken from the DOI (CSASE51777.2022). The citation key is kept as exported.
@inproceedings{7883,
  author    = {Rashid, Shaimaa and Qasha, Rawaa},
  title     = {Extracting and Archiving Data from Social Media to Support Cultural Heritage Preservation in {Nineveh}},
  booktitle = {Proc. Int. Conf. Comput. Sci. Softw. Eng., {CSASE}},
  year      = {2022},
  pages     = {295--300},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  isbn      = {9781665426329},
  doi       = {10.1109/CSASE51777.2022.9759782},
  url       = {https://www.scopus.com/inward/record.uri?eid=2-s2.0-85129952423&doi=10.1109%2fCSASE51777.2022.9759782&partnerID=40&md5=878387f6d3a7d39f809819c9997e14ea},
  keywords  = {Archiving datum, cultural heritage, Cultural heritage preservation, Cultural heritages, Daily lives, Data cleaning, Data collecting, Digital storage, Heritage preservation, Historic preservation, Query processing, Social media, Social media networks, Social networking (online), Web Scraping, Web scrapings},
  abstract  = {During the last decades, various aspects of Nineveh's cultural heritage have been destroyed during wars or natural causes. Therefore, the needs to preserve these valuable heritages become crucial. With the increased use of the Internet, Social media networks have become part of people's daily lives for publicly sharing information, including their feelings, opinion expression, knowledge, and sharing images, videos, audio, and even their locations. This paper aims to gather Nineveh's cultural heritage data from different social media sites. We prepare it to be used for supporting the preservation of the cultural heritage process, including both tangible and intangible heritage. With social media data, python programming language, and web scraping, various data types can be fetched from different heterogeneous sources such as Twitter, YouTube, etc., depending on several keywords and hashtags. Once the data is collected, several pre-processing operations are implemented to clean, organize and archive the resulted data in the NoSQL database and Amazon Simple Storage Service (Amazon S3). The archived cleaned information can be used later to query, browse, analyze and visualize the target information.},
}