求知

探索知识边界,点亮智慧之光

如何挖掘收录口子之搜狗强制收录法-搜狗网页翻译(17年流行)

搜索杂谈

如何挖掘收录口子之搜狗强制收录法-谷歌/百度收录+搜狗浏览器(18年流行)

搜索杂谈

如何挖掘收录口子之搜狗强制收录法-模拟用户行为,“您可以直接访问 XXX”

搜索杂谈

基于百度开源项目LAC实现批量文本分词

常用工具

多线程百度收录批量查询工具,python

"""Check in bulk, with several threads, whether Baidu has indexed a list of URLs.

Author's note (translated): remember to replace the cookie. The script has
not been updated for a long time, so a captcha page may show up; swapping in
a fresh cookie resolves that. It is not known whether the script still works.

URLs are read from ``domains.txt`` (one per line) and the verdicts are
written to ``baidumobilekey.txt`` as ``url<TAB>status`` lines.
"""
import os
import time
from queue import Queue
from threading import Thread
from urllib.parse import quote

import requests

USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'

# Cookie sent when the BAIDU_COOKIE environment variable is not set.
# One adjacent string literal per cookie pair; they concatenate into a
# single header value.
DEFAULT_COOKIE = (
    'PSTM=1636077267; '
    'BIDUPSID=6D048EF8EF78EC012B99FCDD1F25E02E; '
    'BAIDUID=E88303C13DB2F9BEE9834DF179D2F017:FG=1; '
    'BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; '
    'BDSFRCVID=nW8OJeC62ZhFDlrHgQwNJTVRpe3IPzOTH6aoTDPTyF4Ms1bCCtgWEG0PKM8g0Ku-S2L-ogKK0eOTHkCF_2uxOjjg8UtVJeC6EG0Ptf8g0f5; '
    'H_BDCLCKID_SF=JJkO_D_atKvDqTrP-trf5DCShUFstqvdB2Q-XPoO3KJCDpOOyhOjeJ_p3bjNbPQiW5cpoMbgylRp8P3y0bb2DUA1y4vpK-ogQgTxoUJ2fnRJEUcGqj5Ah--ebPRiJPQ9QgbW5hQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0MBCK0hDvPKITD-tFO5eT22-usKerd2hcHMPoosItm04Txyh0SqqrZLp5N-Tnr5D_btMbUoqRHXnJi0btQDPvxBf7pWDTm_q5TtUJMqqOx-JOrqfLn5MOyKMnitIT9-pno0hQrh459XP68bTkA5bjZKxtq3mkjbPbDfn028DKuDjRDKICV-frb-C62aKDsLnbnBhcqJ-ovQT0M04C7ybO2eR3ZQNcNQPb10D_5hUbeWfvpXn-R0hbjJM7xWeJpaJ5nJq5nhMJmKfb2-J0mqto7-P3y523ion5vQpnOEpQ3DRoWXPIqbN7P-p5Z5mAqKl0MLPbtbb0xXj_0DjPVKgTa54cbb4o2WbCQyqjM8pcN2b5oQTO30xTnBp32HCQZ-UQ4-PbdEC_RDpOUWfAkXpJvQnJjt2JxaqRCWJ5TMl5jDh3MKftQ-qOdexQ7bIny0hvctn5cShncLUjrDRLbXU6BK5vPbNcZ0l8K3l02V-bIe-t2XjQhDHRabK6aKC5bL6rJabC3MpOcXU6q2bDeQN0JQt6nWNn2sxQFQUQCq-bsL6K5Dp0vWtv4WbbvLT7johRTWqR4OR5JjxonDh83Ktbj5R3dHmT7LnbO5hvvhb5O3M7OLUKmDloOW-TB5bbPLUQF5l8-sq0x0bOte-bQXH_EJ6tOtRKJ_Kv55RrOfjrP-trf5DCShUFs2q-OB2Q-5KL-yJnPsbR4y-JG5UCp3bjNbPLeWGRE2MbdJJjoShbM3brPjMuyDf5m3b3MQ2TxoUJcBCnJhhvGqq-KXKuebPRiJPQ9QgbW5hQ7tt5W8ncFbT7l5hKpbt-q0x-jLTnhVn0M5DK0HPonHj82DTo33J; '
    'SIGNIN_UC=70a2711cf1d3d9b1a82d2f87d633bd8a03926704966bhwWmyeOdCL0QjlCuOVvMOxjyrVy4GxZIZIErQWbjcI64aIkE7BGUkWEA%2FWPFwWeA%2FKT0b%2BqUa2wadhpSpKuLKQ%2BBVlWyMZ%2FYgfO5UTDIvy%2Fw%2FdyecSVZI9UToftXBg3MBBw5onaHIrLSajCMUJ17noCz2HFy%2Ba0spwEnZtVqQseGx%2Fes6EzWgCGXigbCIofHCh%2ByAjDGefFcsIs1b40LixDu4UnA6xvH1ojVkQ9dyKeXzoDdAVup6RxeSlKsinCy097%2Bxs6kLX3wJ8QNYATgyhqJU0%2FHiw7Lq%2FGLWkqdHTHrXevYNkbh5JG2VYkl4IxWo%2FbrieowtAaVpnE7TFuzw%3D%3D64136060045377610966832433932427; '
    'H_PS_PSSID=35411_35104_31253_35489_34584_35491_35584_34813_35685_35316_26350_35751_22157; '
    'delPer=0; '
    'PSINO=5; '
    'H_WISE_SIDS=107311_110085_114550_127969_184716_186635_186743_186840_186844_188841_189034_189253_189755_190624_190803_191068_191245_191287_191370_192206_193246_193283_193559_194085_194519_195329_195343_195631_196045_196427_196590_197241_197350_197512_197711_197782_197958_198033_198089_198271_198513_198650_199083_199466_199578_199753_199777_199796_199974_200029_200128_200158_200193_200274_200450_200560_200576_200735_200743_201054_201098_201178_201328_201359_201539_201553_201598_201706_201819_201867_201978_8000073_8000104_8000122_8000137_8000150_8000155_8000157_8000173_8000178_8000186; '
    'SE_LAUNCH=5%3A27377659; '
    'rsv_i=aac9nOCV2qvoxyieGLmVG796qLlT3IQrYSHKt8IluOYm%2FdWUUtCyPvMEO5U8Tt6j56my3RpcXdaGGg%2BWgEsqMu%2Fl0cXPZT8; '
    'BD_HOME=1; '
    'BD_UPN=12314753; '
    'BD_CK_SAM=1; '
    'H_PS_645EC=9554Uj3V46WGW%2BlKlqPmBxueMnNiMXk7DGrn7B0QPiYpS6z08f94hLKxBcQ; '
    'BA_HECTOR=0g2h2g840kag802g9h1guhvpg0q; '
    'BDSVRTM=130; '
    'channel=baidusearch; '
    'baikeVisitId=200c57ee-4a42-40bc-829c-8774d10ea51c'
)

SEARCH_URL = 'http://ipv6.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd={}'

# Status strings recorded per URL and written to the output file.
STATUS_NOT_INDEXED = '未收录'
STATUS_INDEXED = '已收录'
STATUS_CAPTCHA = '查询过程出现验证码'
STATUS_NO_SOURCE = '未获取源码'

REQUEST_TIMEOUT = 10   # seconds before a single search request is abandoned
CAPTCHA_PAUSE = 100    # seconds a worker sleeps after seeing the captcha page
WORKER_COUNT = 5       # number of worker threads started by main()


def _classify(html):
    """Map the text of a Baidu result page to one of the STATUS_* strings."""
    if "请检查您的输入是否正确" in html:
        return STATUS_NOT_INDEXED
    if "百度为您找到相关结果约" in html:
        return STATUS_INDEXED
    if "http://verify.baidu.com" in html:
        return STATUS_CAPTCHA
    return STATUS_NO_SOURCE


class shoulu(Thread):
    """Worker thread that takes URLs from a queue and records their status.

    Class attributes:
        seen: URLs that have already been queued; used to skip duplicates.
        result: maps each checked URL to its status string. Shared by all
            worker instances.
    """

    seen = set()
    result = {}

    def __init__(self, k_queue):
        """Store the queue of URLs this worker consumes."""
        super().__init__()
        self.k_queue = k_queue

    def run(self):
        """Process queued URLs forever, acknowledging each one when done."""
        while True:
            url = self.k_queue.get()
            try:
                self._check(url)
            finally:
                # Always acknowledge the item; otherwise Queue.join() in the
                # main thread would block forever after a failure.
                self.k_queue.task_done()

    def _check(self, url):
        """Query Baidu for ``url``, print the verdict and record it."""
        headers = {
            'User-Agent': USER_AGENT,
            # A fresh cookie can be supplied without editing the script.
            'Cookie': os.environ.get('BAIDU_COOKIE', DEFAULT_COOKIE),
        }
        # Percent-encode the URL so characters such as '&', '#' and '?'
        # stay inside the ``wd`` parameter instead of splitting the query.
        query = SEARCH_URL.format(quote(url, safe=''))
        try:
            response = requests.get(query, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print(url, STATUS_NO_SOURCE, exc)
            self.filter(url, STATUS_NO_SOURCE)
            return

        # The query requests ie=utf-8; decode the body as UTF-8 explicitly so
        # the Chinese marker strings can be matched against the text.
        response.encoding = 'utf-8'
        status = _classify(response.text)

        if status == STATUS_CAPTCHA:
            print(status)
        else:
            print(url, status)
        self.filter(url, status)

        if status == STATUS_CAPTCHA:
            # Back off before this worker issues its next request.
            time.sleep(CAPTCHA_PAUSE)

    def filter(self, url, sl):
        """Record status ``sl`` for ``url`` in the shared result mapping."""
        shoulu.result[url] = sl


def main():
    """Read URLs, check them with worker threads and write the results."""
    k_queue = Queue()
    with open('domains.txt', encoding='utf-8') as urls:
        for line in urls:
            url = line.strip()
            # Skip blank lines and URLs that were already queued.
            if not url or url in shoulu.seen:
                continue
            shoulu.seen.add(url)
            k_queue.put(url)

    for _ in range(WORKER_COUNT):
        bds = shoulu(k_queue)
        # Daemon threads let the process exit once the queue is drained,
        # even though run() loops forever.
        bds.daemon = True
        bds.start()

    k_queue.join()

    with open('baidumobilekey.txt', 'w', encoding='utf-8') as save:
        save.writelines(
            '{}\t{}\n'.format(url, status)
            for url, status in shoulu.result.items()
        )
    print('done,完成查询')


if __name__ == "__main__":
    main()
加载更多