diff --git a/Hackathon_model.png b/Hackathon_model.png
new file mode 100644
index 00000000..7d4cda1c
Binary files /dev/null and b/Hackathon_model.png differ
diff --git a/Idea Submission & Presentation 1-0-1.pdf b/Idea Submission & Presentation 1-0-1.pdf
new file mode 100644
index 00000000..d77146d7
Binary files /dev/null and b/Idea Submission & Presentation 1-0-1.pdf differ
diff --git a/README.md b/README.md
index 578bb9cc..38b43803 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # ksp-submission
 This repository is created for Karnataka State Police Hackathon 2023 - submission collection.
 ## Team Information
-### Team Name -
-### Problem Statement -
+### Team Name - EData2
+### Problem Statement - Solution for Crowd Sourcing of Records
diff --git a/captcha.py b/captcha.py
new file mode 100644
index 00000000..618362bc
--- /dev/null
+++ b/captcha.py
@@ -0,0 +1,17 @@
+from PIL import Image
+import pytesseract
+import os
+
+
+def get_captcha(image_path):
+    # OCR the screenshot with Tesseract and strip the trailing
+    # form feed that pytesseract appends to its output.
+    text = pytesseract.image_to_string(Image.open(image_path))
+    # os.remove(image_path)
+    return text.replace("\n\f", "").strip()
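+
+
+if __name__ == "__main__":
+    # Minimal smoke test -- a sketch that assumes Tesseract is installed
+    # and an 'example.png' screenshot exists in the working directory.
+    print(get_captcha("example.png"))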
Response code:", response.status_code) + +# import requests + +# # Replace ACCESS_TOKEN with your Facebook access token +# ACCESS_TOKEN = 'your-access-token' + +# Replace EMAIL with the email address of the Facebook user you want to search for +# EMAIL = 'rahul.v@elintdata.com' + +# # Define the endpoint for the Facebook Graph API +# endpoint = f'https://graph.facebook.com/v9.0/search?q={EMAIL}&type=user&fields=id,name&access_token={ACCESS_TOKEN}' + +# # Make a GET request to the endpoint +# response = requests.get(endpoint) +# print(response) +# # Check if the request was successful +# if response.status_code == 200: +# # If successful, extract the data from the response +# data = response.json() +# users = data['data'] +# if len(users) > 0: +# user = users[0] +# user_id = user['id'] +# user_name = user['name'] +# print('User ID:', user_id) +# print('User Name:', user_name) +# else: +# print('No user found with that email address') +# else: +# # If the request failed, print an error message +# print('Request failed with status code:', response.status_code) +# if response.status_code == 400: +# # If so, extract the error information from the response +# error = response.json() +# error_message = error['error']['message'] +# error_code = error['error']['code'] +# print('Error Message:', error_message) +# print('Error Code:', error_code) + diff --git a/final_ppt.pptx b/final_ppt.pptx new file mode 100644 index 00000000..f359813d Binary files /dev/null and b/final_ppt.pptx differ diff --git a/get_fbid.py b/get_fbid.py new file mode 100644 index 00000000..fee3d77e --- /dev/null +++ b/get_fbid.py @@ -0,0 +1,33 @@ +import requests + +# Replace ACCESS_TOKEN with your access token +ACCESS_TOKEN = 'EAATQZBQ0Xw2UBADgE1rEbVplG4jhzxqrgGktMnq5PSJSjUicx28h7K1foHcRoqI3H7Wnu7eUg6ZCELPfvZBMbodD5QTXtKKzLUzAe75H9vboyb2jZAv1UUrB1GxqW7UQTsogm31zKWiIEimYOMoXEiyOJgAxupXl8u6PFoXE7kkBRD4Ou7NtGvXBSGcv3ugAuUKvZBW45bghrYd4ZARpYw1pkZC4JWOKZCxdHtuSodWohHKtdsPEIiYFVli5B1fSHgwZD' + +# Replace EMAIL_ADDRESS with the email address of the user you want to retrieve the Facebook ID for +email_address = "rahul.v@elintdata.com" + +# Define the endpoint URL +url = f"https://graph.facebook.com/search?q={email_address}&type=user&access_token={ACCESS_TOKEN}" + +# Send a GET request to the endpoint +response = requests.get(url) + +# Check if the request was successful +if response.status_code == 200: + # Parse the response JSON + search_results = response.json()["data"] + + # Loop through the search results to find the user with the matching email address + for user in search_results: + if user["email"] == email_address: + # If the email address matches, print the Facebook ID + print("Facebook ID:", user["id"]) + break +else: + # If the request was unsuccessful, print the error message + print("Failed to retrieve data. 
Response code:", response.status_code) + error = response.json() + error_message = error['error']['message'] + error_code = error['error']['code'] + print('Error Message:', error_message) + print('Error Code:', error_code) \ No newline at end of file diff --git a/get_token.py b/get_token.py new file mode 100644 index 00000000..8e4d8ecb --- /dev/null +++ b/get_token.py @@ -0,0 +1,21 @@ +import requests + +# Replace APP_ID and APP_SECRET with your Facebook app ID and app secret +APP_ID = '586953412840539' +APP_SECRET = 'ca66efcb3b4f8f22b43803e45a51b86f' + +# Define the endpoint for getting an access token +endpoint = f'https://graph.facebook.com/v9.0/oauth/access_token?client_id={APP_ID}&client_secret={APP_SECRET}&grant_type=client_credentials' + +# Make a GET request to the endpoint +response = requests.get(endpoint) + +# Check if the request was successful +if response.status_code == 200: + # If successful, extract the access token from the response + data = response.json() + access_token = data['access_token'] + print('Access Token:', access_token) +else: + # If the request failed, print an error message + print('Request failed with status code:', response.status_code) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..ce33a60e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,23 @@ +appdirs==1.4.4 +certifi==2022.9.24 +click==8.1.3 +Flask==2.2.2 +Flask-Cors==3.0.10 +importlib-metadata==5.0.0 +itsdangerous==2.1.2 +Jinja2==3.1.2 +MarkupSafe==2.1.1 +numpy==1.23.4 +opencv-python==4.6.0.66 +packaging==21.3 +Pillow==9.2.0 +pyee==8.2.2 +pyparsing==3.0.9 +pyppeteer==1.0.2 +pytesseract==0.3.10 +six==1.16.0 +tqdm==4.64.1 +urllib3==1.26.12 +websockets==10.3 +Werkzeug==2.2.2 +zipp==3.10.0 \ No newline at end of file diff --git a/scrap_insta.py b/scrap_insta.py new file mode 100644 index 00000000..23050161 --- /dev/null +++ b/scrap_insta.py @@ -0,0 +1,85 @@ +import asyncio +from pyppeteer import launch +import time +import json +import os +from captcha import get_captcha + +async def get_text(page, selector_path): + element = await page.querySelector(selector_path) + query = '(element) => element.innerText.trim()' + temp = await page.evaluate(query, element) + return temp + + +async def main_scrap(email): + browser = await launch({"headless": False}) + page = await browser.newPage() + await page.goto(url) + + # # take screenshot of captcha + # time.sleep(2) + + #type captcha + time.sleep(2) + # search_box = await page.querySelector('#layers > div:nth-child(2) > div > div > div > div > div > div.css-1dbjc4n.r-1awozwy.r-18u37iz.r-1pi2tsx.r-1777fci.r-1xcajam.r-ipm5af.r-g6jmlv > div.css-1dbjc4n.r-1867qdf.r-1wbh5a2.r-kwpbio.r-rsyp9y.r-1pjcn9w.r-1279nm1.r-htvplk.r-1udh08x > div > div > div.css-1dbjc4n.r-14lw9ot.r-6koalj.r-16y2uox.r-1wbh5a2 > div.css-1dbjc4n.r-16y2uox.r-1wbh5a2.r-1jgb5lz.r-1ye8kvj.r-13qz1uu > div.css-1dbjc4n.r-16y2uox.r-1wbh5a2.r-1dqxon3 > div > div.css-1dbjc4n.r-mk0yit.r-1f1sjgu > label > div > div.css-1dbjc4n.r-18u37iz.r-16y2uox.r-1wbh5a2.r-1wzrnnt.r-1udh08x.r-xd6kpl.r-1pn2ns4.r-ttdzmv') + search_box = await page.querySelector('#mount_0_0_gs > div > div > div > div.x9f619.x1n2onr6.x1ja2u2z > div > div > div > div.x78zum5.xdt5ytf.x10cihs4.x1t2pt76.x1n2onr6.x1ja2u2z > section > main > div._ac06.x78zum5.xdt5ytf > div > div > div > div > div:nth-child(4) > form') + # print(search_box) + await search_box.type(email) + + # press enter button + + time.sleep(2) + # await page.click("#layers > div:nth-child(2) > div > div > div > div > div > 
+
+
+async def main_scrap(email):
+    browser = await launch({"headless": False})
+    page = await browser.newPage()
+    await page.goto(url)
+
+    # # take screenshot of captcha
+    # time.sleep(2)
+
+    # type the email into the search box
+    time.sleep(2)
+    # search_box = await page.querySelector('#layers > div:nth-child(2) > div > div > div > div > div > div.css-1dbjc4n.r-1awozwy.r-18u37iz.r-1pi2tsx.r-1777fci.r-1xcajam.r-ipm5af.r-g6jmlv > div.css-1dbjc4n.r-1867qdf.r-1wbh5a2.r-kwpbio.r-rsyp9y.r-1pjcn9w.r-1279nm1.r-htvplk.r-1udh08x > div > div > div.css-1dbjc4n.r-14lw9ot.r-6koalj.r-16y2uox.r-1wbh5a2 > div.css-1dbjc4n.r-16y2uox.r-1wbh5a2.r-1jgb5lz.r-1ye8kvj.r-13qz1uu > div.css-1dbjc4n.r-16y2uox.r-1wbh5a2.r-1dqxon3 > div > div.css-1dbjc4n.r-mk0yit.r-1f1sjgu > label > div > div.css-1dbjc4n.r-18u37iz.r-16y2uox.r-1wbh5a2.r-1wzrnnt.r-1udh08x.r-xd6kpl.r-1pn2ns4.r-ttdzmv')
+    # NOTE: this selector targets the whole reset form; typing may need to land on the inner input element
+    search_box = await page.querySelector('#mount_0_0_gs > div > div > div > div.x9f619.x1n2onr6.x1ja2u2z > div > div > div > div.x78zum5.xdt5ytf.x10cihs4.x1t2pt76.x1n2onr6.x1ja2u2z > section > main > div._ac06.x78zum5.xdt5ytf > div > div > div > div > div:nth-child(4) > form')
+    # print(search_box)
+    await search_box.type(email)
+
+    # press enter button
+
+    time.sleep(2)
+    # await page.click("#layers > div:nth-child(2) > div > div > div > div > div > div.css-1dbjc4n.r-1awozwy.r-18u37iz.r-1pi2tsx.r-1777fci.r-1xcajam.r-ipm5af.r-g6jmlv > div.css-1dbjc4n.r-1867qdf.r-1wbh5a2.r-kwpbio.r-rsyp9y.r-1pjcn9w.r-1279nm1.r-htvplk.r-1udh08x > div > div > div.css-1dbjc4n.r-14lw9ot.r-6koalj.r-16y2uox.r-1wbh5a2 > div.css-1dbjc4n.r-16y2uox.r-1wbh5a2.r-1jgb5lz.r-1ye8kvj.r-13qz1uu > div.css-1dbjc4n.r-1isdzm1 > div > div > div > div > div")
+
+    await page.click('#layers > div > div > div > div > div > div > div.css-1dbjc4n.r-1awozwy.r-18u37iz.r-1pi2tsx.r-1777fci.r-1xcajam.r-ipm5af.r-g6jmlv > div.css-1dbjc4n.r-1867qdf.r-1wbh5a2.r-kwpbio.r-rsyp9y.r-1pjcn9w.r-1279nm1.r-htvplk.r-1udh08x > div > div > div.css-1dbjc4n.r-14lw9ot.r-6koalj.r-16y2uox.r-1wbh5a2 > div.css-1dbjc4n.r-16y2uox.r-1wbh5a2.r-1jgb5lz.r-1ye8kvj.r-13qz1uu > div > div > div > div:nth-child(6)')
+    # page.keypad('Enter')
+    # time.sleep(10)
+    login_option = await page.querySelector('#layers > div > div > div > div > div > div > div.css-1dbjc4n.r-1awozwy.r-18u37iz.r-1pi2tsx.r-1777fci.r-1xcajam.r-ipm5af.r-g6jmlv > div.css-1dbjc4n.r-1867qdf.r-1wbh5a2.r-kwpbio.r-rsyp9y.r-1pjcn9w.r-1279nm1.r-htvplk.r-1udh08x > div > div > div.css-1dbjc4n.r-14lw9ot.r-6koalj.r-16y2uox.r-1wbh5a2 > div.css-1dbjc4n.r-16y2uox.r-1wbh5a2.r-1jgb5lz.r-1ye8kvj.r-13qz1uu > div > div > div > div:nth-child(1)')
+    time.sleep(2)
+    # print("login option:", login_option)
+
+    captcha_element = await page.querySelector(
+        '#layers > div > div > div > div > div > div > div.css-1dbjc4n.r-1awozwy.r-18u37iz.r-1pi2tsx.r-1777fci.r-1xcajam.r-ipm5af.r-g6jmlv > div.css-1dbjc4n.r-1867qdf.r-1wbh5a2.r-kwpbio.r-rsyp9y.r-1pjcn9w.r-1279nm1.r-htvplk.r-1udh08x > div > div > div.css-1dbjc4n.r-14lw9ot.r-6koalj.r-16y2uox.r-1wbh5a2 > div.css-1dbjc4n.r-16y2uox.r-1wbh5a2.r-1jgb5lz.r-1ye8kvj.r-13qz1uu > div > div > div > div:nth-child(1)'
+    )
+    await captcha_element.screenshot({'path': 'example.png'})
+
+
+    # get the captcha text from the screenshot using pytesseract
+    captcha = get_captcha('example.png')
+
+    print("captcha:", captcha)
+
+    # the sentinel below comes from the Twitter flow; the Instagram page likely needs its own marker
+    if "Sign in to Twitter" in captcha:
+        print("no account was found for this ID")
+    else:
+        print("an account was found for this ID")
+
+
+    await browser.close()
+    return captcha
+
+
+def scrape_data(email):
+    data = asyncio.new_event_loop().run_until_complete(main_scrap(email))
+
+    try:
+        os.makedirs("output_data")
+    except FileExistsError:
+        # directory already exists
+        pass
+    with open('output_data/{}.json'.format(name), 'w') as f:  # 'name' is a global set in the __main__ block
+        json.dump(data, f)
+
+    return data
+
+
+if __name__ == '__main__':
+    url = "https://www.instagram.com/accounts/password/reset/"
+    name = "sumit jha"
+    id = "gopal@elintdata.com"  # only the last of these assignments takes effect
+    id = "rahulverma.upe@gmail.com"
+    id = "gopal.kgpian@gmail.com"
+    id = "6206609503"
+
+    data = scrape_data(id)
\ No newline at end of file
diff --git a/scrap_twitter.py b/scrap_twitter.py
new file mode 100644
index 00000000..021c5cf6
--- /dev/null
+++ b/scrap_twitter.py
@@ -0,0 +1,88 @@
+import asyncio
+from pyppeteer import launch
+import time
+import json
+import os
+from captcha import get_captcha
+
+async def get_text(page, selector_path):
+    element = await page.querySelector(selector_path)
+    query = '(element) => element.innerText.trim()'
+    temp = await page.evaluate(query, element)
+    return temp
+
+
+async def main_scrap(email):
+    browser = await launch({"headless": False})
+    page = await browser.newPage()
+    await page.goto(url)
+
+    # # take screenshot of captcha
+    # time.sleep(2)
+
+    # type the email into the login box
+    time.sleep(5)
+    # search_box = await page.querySelector('#layers > div:nth-child(2) > div > div > div > div > div > div.css-1dbjc4n.r-1awozwy.r-18u37iz.r-1pi2tsx.r-1777fci.r-1xcajam.r-ipm5af.r-g6jmlv > div.css-1dbjc4n.r-1867qdf.r-1wbh5a2.r-kwpbio.r-rsyp9y.r-1pjcn9w.r-1279nm1.r-htvplk.r-1udh08x > div > div > div.css-1dbjc4n.r-14lw9ot.r-6koalj.r-16y2uox.r-1wbh5a2 > div.css-1dbjc4n.r-16y2uox.r-1wbh5a2.r-1jgb5lz.r-1ye8kvj.r-13qz1uu > div.css-1dbjc4n.r-16y2uox.r-1wbh5a2.r-1dqxon3 > div > div.css-1dbjc4n.r-mk0yit.r-1f1sjgu > label > div > div.css-1dbjc4n.r-18u37iz.r-16y2uox.r-1wbh5a2.r-1wzrnnt.r-1udh08x.r-xd6kpl.r-1pn2ns4.r-ttdzmv')
+    search_box = await page.querySelector('#layers > div > div > div > div > div > div > div.css-1dbjc4n.r-1awozwy.r-18u37iz.r-1pi2tsx.r-1777fci.r-1xcajam.r-ipm5af.r-g6jmlv > div.css-1dbjc4n.r-1867qdf.r-1wbh5a2.r-kwpbio.r-rsyp9y.r-1pjcn9w.r-1279nm1.r-htvplk.r-1udh08x > div > div > div.css-1dbjc4n.r-14lw9ot.r-6koalj.r-16y2uox.r-1wbh5a2 > div.css-1dbjc4n.r-16y2uox.r-1wbh5a2.r-1jgb5lz.r-1ye8kvj.r-13qz1uu > div > div > div > div.css-1dbjc4n.r-mk0yit.r-1f1sjgu.r-13qz1uu > label > div > div.css-1dbjc4n.r-18u37iz.r-16y2uox.r-1wbh5a2.r-1wzrnnt.r-1udh08x.r-xd6kpl.r-1pn2ns4.r-ttdzmv > div > input')
+    # print(search_box)
+    await search_box.type(email)
+
+    # press enter button
+
+    time.sleep(5)
+    # await page.click("#layers > div:nth-child(2) > div > div > div > div > div > div.css-1dbjc4n.r-1awozwy.r-18u37iz.r-1pi2tsx.r-1777fci.r-1xcajam.r-ipm5af.r-g6jmlv > div.css-1dbjc4n.r-1867qdf.r-1wbh5a2.r-kwpbio.r-rsyp9y.r-1pjcn9w.r-1279nm1.r-htvplk.r-1udh08x > div > div > div.css-1dbjc4n.r-14lw9ot.r-6koalj.r-16y2uox.r-1wbh5a2 > div.css-1dbjc4n.r-16y2uox.r-1wbh5a2.r-1jgb5lz.r-1ye8kvj.r-13qz1uu > div.css-1dbjc4n.r-1isdzm1 > div > div > div > div > div")
+
+    await page.click('#layers > div > div > div > div > div > div > div.css-1dbjc4n.r-1awozwy.r-18u37iz.r-1pi2tsx.r-1777fci.r-1xcajam.r-ipm5af.r-g6jmlv > div.css-1dbjc4n.r-1867qdf.r-1wbh5a2.r-kwpbio.r-rsyp9y.r-1pjcn9w.r-1279nm1.r-htvplk.r-1udh08x > div > div > div.css-1dbjc4n.r-14lw9ot.r-6koalj.r-16y2uox.r-1wbh5a2 > div.css-1dbjc4n.r-16y2uox.r-1wbh5a2.r-1jgb5lz.r-1ye8kvj.r-13qz1uu > div > div > div > div:nth-child(6)')
+    # page.keypad('Enter')
+    # time.sleep(10)
+    login_option = await page.querySelector('#layers > div > div > div > div > div > div > div.css-1dbjc4n.r-1awozwy.r-18u37iz.r-1pi2tsx.r-1777fci.r-1xcajam.r-ipm5af.r-g6jmlv > div.css-1dbjc4n.r-1867qdf.r-1wbh5a2.r-kwpbio.r-rsyp9y.r-1pjcn9w.r-1279nm1.r-htvplk.r-1udh08x > div > div > div.css-1dbjc4n.r-14lw9ot.r-6koalj.r-16y2uox.r-1wbh5a2 > div.css-1dbjc4n.r-16y2uox.r-1wbh5a2.r-1jgb5lz.r-1ye8kvj.r-13qz1uu > div > div > div > div:nth-child(1)')
+    time.sleep(5)
+    # print("login option:", login_option)
+
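+    # The fixed sleeps above are fragile; pyppeteer's waitForSelector
+    # would be sturdier here, e.g. (selector and timeout illustrative):
+    #     await page.waitForSelector('#layers', timeout=10000)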
+    captcha_element = await page.querySelector(
+        '#layers > div > div > div > div > div > div > div.css-1dbjc4n.r-1awozwy.r-18u37iz.r-1pi2tsx.r-1777fci.r-1xcajam.r-ipm5af.r-g6jmlv > div.css-1dbjc4n.r-1867qdf.r-1wbh5a2.r-kwpbio.r-rsyp9y.r-1pjcn9w.r-1279nm1.r-htvplk.r-1udh08x > div > div > div.css-1dbjc4n.r-14lw9ot.r-6koalj.r-16y2uox.r-1wbh5a2 > div.css-1dbjc4n.r-16y2uox.r-1wbh5a2.r-1jgb5lz.r-1ye8kvj.r-13qz1uu > div > div > div > div:nth-child(1)'
+    )
+    await captcha_element.screenshot({'path': 'example.png'})
+
+
+    # get the captcha text from the screenshot using pytesseract
+    captcha = get_captcha('example.png')
+
+    print("captcha:", captcha)
+
+    if "Sign in to Twitter" in captcha:
+        print("no Twitter account was found for this ID")
+    else:
+        print("a Twitter account was found for this ID")
+
+
+    await browser.close()
+    return captcha
+
+
+def scrape_data(email):
+    data = asyncio.new_event_loop().run_until_complete(main_scrap(email))
+
+    try:
+        os.makedirs("output_data")
+    except FileExistsError:
+        # directory already exists
+        pass
+    with open('output_data/{}.json'.format(name), 'w') as f:  # 'name' is a global set in the __main__ block
+        json.dump(data, f)
+
+    return data
+
+
+if __name__ == '__main__':
+    url = "https://twitter.com/i/flow/login"
+    name = "sumit jha"
+    id = "gopal@elintdata.com"  # only the last of these assignments takes effect
+    id = "rahulverma.upe@gmail.com"
+    id = "gopal.kgpian@gmail.com"
+    id = "9845107111"
+
+    data = scrape_data(id)
\ No newline at end of file