From f9430470cbc3386b3490c370203c833c0af899ac Mon Sep 17 00:00:00 2001 From: "g.minoccari" Date: Fri, 28 May 2021 11:48:58 +0200 Subject: [PATCH] Refactor della funzione per associare i tweet alle condizioni che ne hanno scaturito il download --- nest_crawler/associate_condition_tweet.py | 52 ++++++++++++++++++++++ nest_crawler/repo_search.py | 53 +++++++++-------------- 2 files changed, 72 insertions(+), 33 deletions(-) create mode 100644 nest_crawler/associate_condition_tweet.py diff --git a/nest_crawler/associate_condition_tweet.py b/nest_crawler/associate_condition_tweet.py new file mode 100644 index 0000000..f6bfce5 --- /dev/null +++ b/nest_crawler/associate_condition_tweet.py @@ -0,0 +1,52 @@ +from datetime import datetime +from math import cos, radians + +from nest_backend.database import * + + +def associate_condition_tweet(conditions_type, tweet): + if ConditionType.hashtag in conditions_type.keys(): + for condition_content in conditions_type[ConditionType.hashtag]: + if condition_content.content in [hashtag['text'] for hashtag in tweet.entities['hashtags']]: + if not Contains.query.filter_by(snowflake=str(tweet.id), cid=condition_content.id).all(): + condition_associated = Contains(cid=condition_content.id, snowflake=tweet.id) + ext.session.add(condition_associated) + ext.session.commit() + if ConditionType.user in conditions_type.keys(): + for condition_content in conditions_type[ConditionType.user]: + if condition_content.content == tweet.author.screen_name: + if not Contains.query.filter_by(snowflake=str(tweet.id), cid=condition_content.id).all(): + condition_associated = Contains(cid=condition_content.id, snowflake=tweet.id) + ext.session.add(condition_associated) + ext.session.commit() + if ConditionType.time in conditions_type.keys(): + for condition_content in conditions_type[ConditionType.time]: + condition_date_time = datetime.fromisoformat(condition_content.content[2:]) + if condition_content.content[0] == '<': + if tweet.created_at < condition_date_time: + if not Contains.query.filter_by(snowflake=str(tweet.id), cid=condition_content.id).all(): + condition_associated = Contains(cid=condition_content.id, snowflake=tweet.id) + ext.session.add(condition_associated) + ext.session.commit() + elif condition_content.content[0] == '>': + if tweet.created_at > condition_date_time: + if not Contains.query.filter_by(snowflake=str(tweet.id), cid=condition_content.id).all(): + condition_associated = Contains(cid=condition_content.id, snowflake=tweet.id) + ext.session.add(condition_associated) + ext.session.commit() + if ConditionType.coordinates in conditions_type.keys(): + for condition_content in conditions_type[ConditionType.coordinates]: + coordinates = condition_content.content.split() + if tweet.geo is not None and is_coordinate_inside_bounding_box(float(coordinates[2]), float(coordinates[3]), float(coordinates[1])/1000, tweet.geo['coordinates'][0], tweet.geo['coordinates'][1]): + if not Contains.query.filter_by(snowflake=str(tweet.id), cid=condition_content.id).all(): + condition_associated = Contains(cid=condition_content.id, snowflake=tweet.id) + ext.session.add(condition_associated) + ext.session.commit() + + +def is_coordinate_inside_bounding_box(latitude, longitude, radius, tweet_latitude, tweet_longitude): + earth_radius_km = 6371 + dLatitude = 360 * radius / earth_radius_km + dLongitude = dLatitude * cos(radians(latitude)) + if (latitude - dLatitude < tweet_latitude < latitude+dLatitude) and (longitude-dLongitude < tweet_longitude < longitude+dLongitude): + return True diff --git a/nest_crawler/repo_search.py b/nest_crawler/repo_search.py index 2daa636..d796ea7 100644 --- a/nest_crawler/repo_search.py +++ b/nest_crawler/repo_search.py @@ -2,10 +2,10 @@ from nest_backend.database import * from authentication import authenticate from datetime import datetime, timedelta import tweepy as tw +from associate_condition_tweet import associate_condition_tweet def search_repo_conditions(repository_id): api = authenticate() - geocode = "44.3591600,11.7132000,20km" repo = Repository.query.filter_by(id=repository_id).first() if repo is None: print("Non esiste una repository con questo id") @@ -15,9 +15,9 @@ def search_repo_conditions(repository_id): return False evaluation_mode = repo.evaluation_mode conditions_type = dict() + # Dividing condition into condition types for condition in conditions: - # print(condition.id) if condition.type not in conditions_type.keys(): conditions_type[condition.type] = [condition] else: @@ -32,9 +32,12 @@ def search_repo_conditions(repository_id): for types in conditions_type.keys(): print(types, ":", conditions_type[types]) coordinates_string = "" + # Adding to the query string the hashtag conditions if ConditionType.hashtag in conditions_type.keys(): for condition_content in conditions_type[ConditionType.hashtag]: queryString += ("#" + condition_content.content + " " + queryConjunction + " ") + + # Adding to the coordinates string the coordinates condition if ConditionType.coordinates in conditions_type.keys(): if evaluation_mode == ConditionMode.all_and: if len(conditions_type[ConditionType.coordinates]) == 1: @@ -56,7 +59,6 @@ def search_repo_conditions(repository_id): image_url_list = image_url_list[:-1] else: image_url_list = None - tweetDB = Tweet(snowflake=tweet.id, content=tweet.text, location=tweet.geo['coordinates'] if tweet.geo is not None else None, place=tweet.place.full_name if tweet.place is not None else None, @@ -72,17 +74,21 @@ def search_repo_conditions(repository_id): composed = Composed(rid=repository_id, snowflake=tweet.id) ext.session.add(composed) ext.session.commit() + # Adding to the query string the user condition if ConditionType.user in conditions_type.keys(): for condition_content in conditions_type[ConditionType.user]: queryString += ("from:" + condition_content.content + " " + queryConjunction + " ") + # Adding to the query string the time condition if ConditionType.time in conditions_type.keys(): for condition_content in conditions_type[ConditionType.time]: if condition_content.content[0] == '<': queryString += ("until:" + condition_content.content[2:] + " " + queryConjunction + " ") elif condition_content.content[0] == '>': queryString += ("since:" + condition_content.content[2:] + " " + queryConjunction + " ") + # End of query string queryString = queryString[:-len(queryConjunction) - 1] print(queryString) + if evaluation_mode == ConditionMode.all_or: if queryString != "": for tweet in tw.Cursor(method=api.search, q=queryString).items(10): @@ -109,42 +115,23 @@ def search_repo_conditions(repository_id): ext.session.add(tweetDB) ext.session.commit() if evaluation_mode == ConditionMode.all_or: - if ConditionType.hashtag in conditions_type.keys(): - for condition_content in conditions_type[ConditionType.hashtag]: - if condition_content.content in [hashtag['text'] for hashtag in tweet.entities['hashtags']]: - if not Contains.query.filter_by(snowflake=str(tweet.id), cid=condition_content.id).all(): - condition_associated = Contains(cid=condition_content.id, snowflake=tweet.id) - ext.session.add(condition_associated) - ext.session.commit() - if ConditionType.user in conditions_type.keys(): - for condition_content in conditions_type[ConditionType.user]: - if condition_content.content == tweet.author.screen_name: - if not Contains.query.filter_by(snowflake=str(tweet.id), cid=condition_content.id).all(): - condition_associated = Contains(cid=condition_content.id, snowflake=tweet.id) - ext.session.add(condition_associated) - ext.session.commit() - - if ConditionType.time in conditions_type.keys(): - for condition_content in conditions_type[ConditionType.time]: - condition_date_time = datetime.fromisoformat(condition_content.content[2:]) - if condition_content.content[0] == '<': - if tweet.created_at < condition_date_time: - if not Contains.query.filter_by(snowflake=str(tweet.id), cid=condition_content.id).all(): - condition_associated = Contains(cid=condition_content.id, snowflake=tweet.id) - ext.session.add(condition_associated) - ext.session.commit() - elif condition_content.content[0] == '>': - if tweet.created_at > condition_date_time: - if not Contains.query.filter_by(snowflake=str(tweet.id), cid=condition_content.id).all(): - condition_associated = Contains(cid=condition_content.id, snowflake=tweet.id) - ext.session.add(condition_associated) - ext.session.commit() + associate_condition_tweet(conditions_type, tweet) elif evaluation_mode == ConditionMode.all_and: for condition in conditions: if not Contains.query.filter_by(snowflake=str(tweet.id), cid=condition.id).all(): condition_associated = Contains(cid=condition.id, snowflake=tweet.id) ext.session.add(condition_associated) ext.session.commit() + alerts = [alert for alert in repo.alerts] + for alert in alerts: + alert_conditions = [condition.condition for condition in alert.conditions] + alert_conditions_type = dict() + for condition in alert_conditions: + if condition.type not in alert_conditions_type.keys(): + alert_conditions_type[condition.type] = [condition] + else: + alert_conditions_type[condition.type].append(condition) + associate_condition_tweet(alert_conditions_type, tweet) if not Composed.query.filter_by(snowflake=str(tweet.id), rid=repository_id).all(): composed = Composed(rid=repository_id, snowflake=tweet.id) ext.session.add(composed)