103 lines
4.8 KiB
Python
103 lines
4.8 KiB
Python
|
|
import json
|
|||
|
|
import time
|
|||
|
|
import requests
|
|||
|
|
import socket
|
|||
|
|
import socks
|
|||
|
|
import urllib.parse
|
|||
|
|
from datetime import datetime
|
|||
|
|
|
|||
|
|
# Address of the local SOCKS5 proxy used for all outgoing connections.
_PROXY_ENDPOINT = ('127.0.0.1', 1080)

# Register the proxy as the PySocks default, then swap the standard socket
# class for the SOCKS-aware one so sockets created from here on (including
# the ones `requests` opens) are expected to go through that proxy.
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, *_PROXY_ENDPOINT)
socket.socket = socks.socksocket

# Hashtag (without the leading '#') whose top posts are scraped.
tag_name = 'برنامه_نویسی'
|
|||
|
|
|
|||
|
|
|
|||
|
|
def getTopPosts():
    """Fetch the Instagram ``web_info`` payload for the module-level ``tag_name``.

    Sends a GET request to ``/api/v1/tags/web_info/`` with browser-like
    headers and the hard-coded session cookie below.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: on connection problems, a timeout, or a
            non-success HTTP status.
        ValueError: if the response body is not valid JSON.
    """
    # NOTE(review): the CSRF token and session cookie are hard-coded
    # credentials; they expire and should not live in source control.
    # Consider loading them from the environment or a config file.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0',
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'X-CSRFToken': '3rSOOhL1DSUQT2czImiCbMevgLgGx6kZ',
        'X-IG-App-ID': '936619743392459',
        'X-ASBD-ID': '198387',
        'X-IG-WWW-Claim': 'hmac.AR0OphPNsUYX-i9oXNzfh6JF3hDx_3eDAjxFFjWI0DYyKsO6',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
        # URL-encoded form of the tag page; hard-coded, so it must be
        # updated by hand if `tag_name` changes.
        'Referer': 'https://www.instagram.com/explore/tags/%D8%A8%D8%B1%D9%86%D8%A7%D9%85%D9%87_%D9%86%D9%88%DB%8C%D8%B3%DB%8C/',
        'Cookie': 'ig_did=ACEC5413-54FC-4EFA-B0AF-C5F43D60BB8A; datr=usoZZDUz2YHk2UAtkauI2jWW; mid=ZBnKwwALAAEW_HOHO2zCuHUsHbG8; ig_nrcb=1; sessionid=58527153666%3AszpE7BNZUXL1Z9%3A7%3AAYelK_Wp6wCVw9U89yijo3VXXbDpn4zdD9Q-rOXy-Q; ds_user_id=58527153666; csrftoken=3rSOOhL1DSUQT2czImiCbMevgLgGx6kZ; dpr=1.25; shbid="16293\\05458527153666\\0541712933531:01f7f3123fe1f37549d037b1a1411e119e8d36295768b16f0d4189989400a3f6c4719399"; shbts="1681397531\\05458527153666\\0541712933531:01f78bd56e3232a7c8e15ca38b2d311e7d086901d832c9e272814290edb76152436a31dd"; rur="LDC\\05458527153666\\0541712933636:01f75496bea7dd870ec16fa6f07ff96b9d518c7d09d2d1b3c6b84cdcbd06689e2ae5bc1e"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
    }

    params = {
        'tag_name': tag_name,
    }

    # Without a timeout `requests` may block indefinitely on a stalled
    # connection (e.g. when the proxy is unreachable).
    response = requests.get(
        'https://www.instagram.com/api/v1/tags/web_info/',
        params=params,
        headers=headers,
        timeout=30,
    )
    # Fail here with a clear HTTP error instead of a confusing JSON/KeyError
    # further down when the session is rejected or rate-limited.
    response.raise_for_status()
    return response.json()
|
|||
|
|
|
|||
|
|
|
|||
|
|
def getComments(min_id: str, post_pk):
    """Fetch one page of comments for a post.

    Args:
        min_id: Pagination cursor. A value starting with ``"permalink"``
            requests the first page (``permalink_enabled=false``); any other
            value is URL-quoted and sent as the ``min_id`` query parameter.
        post_pk: Media identifier interpolated into the endpoint URL.

    Returns:
        The decoded JSON body of the response, or ``False`` if anything went
        wrong (network error, timeout, non-JSON body, invalid ``min_id``).
    """
    try:
        base_url = f"https://www.instagram.com/api/v1/media/{post_pk}/comments/?can_support_threading=true"
        if min_id.startswith("permalink"):
            url = f"{base_url}&permalink_enabled=false"
        else:
            url = f"{base_url}&min_id={urllib.parse.quote(str(min_id).encode())}"

        # NOTE(review): the CSRF token and session cookie are hard-coded
        # credentials; they expire and should not live in source control.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0',
            'Accept': '*/*',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'X-CSRFToken': '3rSOOhL1DSUQT2czImiCbMevgLgGx6kZ',
            'X-IG-App-ID': '936619743392459',
            'X-ASBD-ID': '198387',
            'X-IG-WWW-Claim': 'hmac.AR0OphPNsUYX-i9oXNzfh6JF3hDx_3eDAjxFFjWI0DYyKsO6',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
            # Fixed referer of one specific post; it is not derived from
            # `post_pk`.
            'Referer': 'https://www.instagram.com/p/CqkUyB6ICbR/',
            'Cookie': 'ig_did=ACEC5413-54FC-4EFA-B0AF-C5F43D60BB8A; datr=usoZZDUz2YHk2UAtkauI2jWW; mid=ZBnKwwALAAEW_HOHO2zCuHUsHbG8; ig_nrcb=1; sessionid=58527153666%3AszpE7BNZUXL1Z9%3A7%3AAYelK_Wp6wCVw9U89yijo3VXXbDpn4zdD9Q-rOXy-Q; ds_user_id=58527153666; csrftoken=3rSOOhL1DSUQT2czImiCbMevgLgGx6kZ; dpr=1.25; shbid="16293\\05458527153666\\0541712933531:01f7f3123fe1f37549d037b1a1411e119e8d36295768b16f0d4189989400a3f6c4719399"; shbts="1681397531\\05458527153666\\0541712933531:01f78bd56e3232a7c8e15ca38b2d311e7d086901d832c9e272814290edb76152436a31dd"; rur="LDC\\05458527153666\\0541712933653:01f7eaf265c5347c6a98150aed096c93c7f28416075157f5491744f72be82f92a96b225b"',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
        }

        # A timeout keeps a stalled connection from blocking the scraper
        # forever; a timed-out request is reported as a failure (False).
        response = requests.request("GET", url, headers=headers, timeout=30)
        return response.json()
    except Exception as exc:
        # `Exception` (not a bare `except`) so that Ctrl+C / SystemExit still
        # stop the script instead of being reported as a failed fetch.
        print(f"getComments failed for post {post_pk}: {exc!r}")
        return False
|
|||
|
|
|
|||
|
|
|
|||
|
|
# --- Script body: collect the comments of every top post of `tag_name` and
# --- dump them into a single JSON file.
# NOTE(review): the indentation of this file was lost in the copy under
# review; the nesting below (summary + file dump after all loops) is the
# reconstructed reading and should be confirmed against the original.

# Consecutive failed page fetches tolerated per post before moving on, so a
# dead session or proxy cannot make the script retry forever.
MAX_FETCH_FAILURES = 3

data = getTopPosts()
grids = data['data']['top']['sections']
comments = []
mediacounter = 0
last_post_pk = None  # pk of the most recently processed post (used in the file name)

for grid in grids:
    medias = grid['layout_content']['medias']
    for media in medias:
        last_post_pk = media['media']['pk']
        # A cursor starting with "permalink" makes getComments request the
        # first page.
        next_cursor = 'permalink_enabled=false'
        failures = 0

        while True:
            page = getComments(min_id=next_cursor, post_pk=last_post_pk)
            if page is False:
                failures += 1
                if failures >= MAX_FETCH_FAILURES:
                    print(f"giving up on post {last_post_pk} after {failures} failed fetches")
                    break
                time.sleep(5)
                continue

            failures = 0
            next_cursor = page.get('next_min_id')
            # Error payloads may lack 'comments'; treat that as an empty page.
            comments += page.get('comments', [])
            time.sleep(5)  # throttle between page requests
            # Stop when there is no further cursor, or the cursor does not
            # carry a 'cached_comments_cursor' entry.
            if next_cursor is None or 'cached_comments_cursor' not in next_cursor:
                break

        mediacounter += 1
        print(mediacounter)

print(f"comments counts: {len(comments)}")

current = datetime.now()
output_name = f"{tag_name}-{current.strftime('%Y-%m-%d-T-%H-%M-%S')}-{last_post_pk}-scrapped.json"
# `with` guarantees the file is closed even if the write fails.
with open(output_name, "w", encoding="utf-8") as f:
    f.write(json.dumps(comments))
|