# ex02(1).py (3.0 KB)
# NOTE: the run of digits that followed this header was the web viewer's
# line-number column (extraction residue), not program text.
import re
import sys
import time

import ddddocr
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service as EdgeService
  8. def getWeb():
  9. global edge
  10. edge_options = Options()
  11. edge_options.add_experimental_option("excludeSwitches", ["enable-logging'"])
  12. edge_options.add_experimental_option("excludeSwitches", ["enable-automation"])
  13. edge_options.add_experimental_option('useAutomationExtension', False)
  14. #edge_options.add_argument('headless')
  15. service = EdgeService(executable_path=r'D:\222\msedgedriver.exe')
  16. edge = webdriver.Edge(service=service, options=edge_options)
  17. edge.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {"source":
  18. """
  19.         Object.defineProperty(navigator, 'webdriver', {
  20.           get: () => undefined
  21.         })
  22.       """
  23. })
  24. def getSoGou_link(WX_ID):
  25. url = f'https://weixin.sogou.com/weixin?type=1&s_from=input&query={WX_ID}'
  26. edge.get(url)
  27. print(">>> 开始识别验证码")
  28. while True:
  29. try:
  30. edge.find_element(By.CSS_SELECTOR ,' #seccodeImage').screenshot('tempVerification_code.png')
  31. except:
  32. break
  33. ocr = ddddocr.DdddOcr(show_ad=False)
  34. with open('tempVerification_code.png', 'rb') as f:
  35. img_bytes = f.read()
  36. Verification_code = (ocr.classification(img_bytes)).upper()
  37. print('>>> 识别成功!')
  38. print(Verification_code)
  39. edge.find_element(By.NAME, 'c').send_keys(Verification_code)
  40. time.sleep(0.5)
  41. edge.find_element(By.ID, 'submit').click()
  42. time.sleep(1)
  43. try:
  44. edge.find_element(By.ID, 'change-img').click()
  45. time.sleep(0.5)
  46. except:
  47. break
  48. WX_link_list = edge.find_elements(By.XPATH, '//*[@id="sogou_vr_11002301_box_0"]/dl/dd/a')
  49. if len(WX_link_list) == 0 :
  50. return False
  51. return WX_link_list[0].get_attribute('href')
  52. def getWX_html(link):
  53. edge.get(link)
  54. time.sleep(3)
  55. try:
  56. txt = edge.find_element(By.XPATH, '//*[@id="img-content"]').text
  57. print(txt.split('\n'))
  58. except:
  59. pass
  60. try:
  61. img_list = edge.find_elements(By.TAG_NAME, 'img')
  62. for i in img_list:
  63. if i.get_attribute('data-src') != None:
  64. print(i.get_attribute('data-src'))
  65. except:
  66. pass
  67. try:
  68. video_list = edge.find_elements(By.TAG_NAME, 'video')
  69. for i in video_list:
  70. print(i.get_attribute('src'))
  71. except:
  72. pass
  73. if __name__ == "__main__":
  74. with open('id.txt', 'r', encoding='UTF-8') as file:
  75. id_data = file.read()
  76. id = re.compile(r'\'query\' \=\> \'(.*)\'').findall(id_data)
  77. getWeb()
  78. for x in id:
  79. print('--'*60)
  80. link = getSoGou_link(WX_ID=x)
  81. if link == False:
  82. print(">>> Error:没有文章")
  83. continue
  84. print(link)
  85. getWX_html(link=link)