123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113 |
import re
import time

import ddddocr
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service as EdgeService
def getWeb(driver_path=r'D:\222\msedgedriver.exe', headless=False):
    """Start an Edge WebDriver session and store it in the module global ``edge``.

    The session excludes the ``enable-logging`` and ``enable-automation``
    switches, disables ``useAutomationExtension``, and injects a script on
    every new document that makes ``navigator.webdriver`` read as
    ``undefined``.

    Args:
        driver_path: Filesystem path to the ``msedgedriver`` executable.
        headless: When true, pass the ``headless`` argument to Edge.

    Returns:
        The created driver (also available as the global ``edge``).
    """
    global edge

    edge_options = Options()
    # add_experimental_option keeps a single value per option name, so both
    # switches have to be supplied in one list; two separate calls would
    # leave only the last list in effect.
    edge_options.add_experimental_option(
        "excludeSwitches", ["enable-logging", "enable-automation"]
    )
    edge_options.add_experimental_option('useAutomationExtension', False)
    if headless:
        edge_options.add_argument('headless')

    service = EdgeService(executable_path=driver_path)
    edge = webdriver.Edge(service=service, options=edge_options)

    # Runs before any page script, so pages see navigator.webdriver as undefined.
    hide_webdriver_js = """
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        })
    """
    edge.execute_cdp_cmd(
        "Page.addScriptToEvaluateOnNewDocument",
        {"source": hide_webdriver_js},
    )
    return edge
def getSoGou_link(WX_ID):
    """Search Sogou's WeChat index for ``WX_ID`` and return the first result link.

    Uses the module global ``edge`` driver. While the page exposes a captcha
    image (``#seccodeImage``), the image is screenshotted to
    ``tempVerification_code.png``, recognised with ddddocr, typed into the
    ``c`` field and submitted. The loop ends as soon as either the captcha
    image or the ``change-img`` refresh control can no longer be used.

    Args:
        WX_ID: Query string inserted into the Sogou search URL.

    Returns:
        The ``href`` of the first anchor matched by the results XPath, or
        ``False`` when no anchor matches.
    """
    url = f'https://weixin.sogou.com/weixin?type=1&s_from=input&query={WX_ID}'
    edge.get(url)
    print(">>> 开始识别验证码")

    ocr = None  # built lazily: only needed when a captcha actually shows up
    while True:
        try:
            captcha = edge.find_element(By.CSS_SELECTOR, ' #seccodeImage')
            captcha.screenshot('tempVerification_code.png')
        except WebDriverException:
            # No captcha image available: nothing (more) to solve.
            break

        if ocr is None:
            ocr = ddddocr.DdddOcr(show_ad=False)
        with open('tempVerification_code.png', 'rb') as f:
            img_bytes = f.read()
        Verification_code = ocr.classification(img_bytes).upper()
        print('>>> 识别成功!')
        print(Verification_code)

        edge.find_element(By.NAME, 'c').send_keys(Verification_code)
        time.sleep(0.5)
        edge.find_element(By.ID, 'submit').click()
        time.sleep(1)

        try:
            # If the refresh control is still usable, request a new image
            # and try again on the next iteration.
            edge.find_element(By.ID, 'change-img').click()
            time.sleep(0.5)
        except WebDriverException:
            break

    WX_link_list = edge.find_elements(
        By.XPATH, '//*[@id="sogou_vr_11002301_box_0"]/dl/dd/a'
    )
    if not WX_link_list:
        return False
    return WX_link_list[0].get_attribute('href')
def getWX_html(link):
    """Open ``link`` in the global ``edge`` driver and print its content.

    After a fixed 3-second wait, prints in order: the text of the
    ``#img-content`` element split into lines, every non-missing
    ``data-src`` attribute of ``<img>`` elements, and the ``src`` attribute
    of every ``<video>`` element. Each of the three steps is best-effort: a
    WebDriver failure in one step is reported and the next step still runs.

    Args:
        link: URL of the page to load.
    """
    edge.get(link)
    time.sleep(3)

    try:
        txt = edge.find_element(By.XPATH, '//*[@id="img-content"]').text
        print(txt.split('\n'))
    except WebDriverException as err:
        print(f'>>> Warning: article text unavailable ({type(err).__name__})')

    try:
        for img in edge.find_elements(By.TAG_NAME, 'img'):
            data_src = img.get_attribute('data-src')
            if data_src is not None:
                print(data_src)
    except WebDriverException as err:
        print(f'>>> Warning: image listing interrupted ({type(err).__name__})')

    try:
        for video in edge.find_elements(By.TAG_NAME, 'video'):
            print(video.get_attribute('src'))
    except WebDriverException as err:
        print(f'>>> Warning: video listing interrupted ({type(err).__name__})')
if __name__ == "__main__":
    # Script entry point: read the queries from id.txt, look each one up on
    # Sogou, and print the content of the first matching page.
    with open('id.txt', 'r', encoding='UTF-8') as file:
        id_data = file.read()
    # One query per match of the form: 'query' => '<value>'
    wx_ids = re.compile(r'\'query\' \=\> \'(.*)\'').findall(id_data)

    getWeb()
    try:
        for wx_id in wx_ids:
            print('--'*60)
            link = getSoGou_link(WX_ID=wx_id)
            # getSoGou_link returns False when nothing matched; a missing or
            # empty href is treated the same way instead of being opened.
            if not link:
                print(">>> Error:没有文章")
                continue
            print(link)
            getWX_html(link=link)
    finally:
        # Always shut the browser and driver process down, even on errors.
        edge.quit()
|