zdavatz / oddb.org

Open Drug Database for Switzerland
https://ch.oddb.org
GNU General Public License v3.0
10 stars 8 forks source link

Swissreg Parser Update #282

Open zdavatz opened 2 weeks ago

zdavatz commented 2 weeks ago
  1. The Swissreg website has changed.
  2. We are now parsing Swissreg with a different tool.
  3. See: https://github.com/zdavatz/ts4cpp/issues/16
  4. We need this URL as entry point:
  5. https://www.swissreg.ch/database-client/search/query/certificate-publications
  6. From there we can search by the 5 digit swissmedic Number
zdavatz commented 4 days ago
  1. This script will do the job of finding the correct result, and showing the first correct URL.
  2. copilot_swissreg.rb.txt
require 'selenium-webdriver'

# Validate the command line: exactly one argument consisting of five digits.
# Anything else prints the usage line and terminates with status 1.
unless ARGV.length == 1 && ARGV.first.match(/\A\d{5}\z/)
  puts "Usage: ruby search_certificates.rb <five-digit-number>"
  exit 1
end

# The validated argument is the number that gets typed into the search field.
search_number = ARGV.first

# Start a Chrome session and create an explicit wait of up to 60 seconds
# for the element lookups that follow.
driver = Selenium::WebDriver.for(:chrome)
wait = Selenium::WebDriver::Wait.new(timeout: 60)

# Load the Swissreg certificate search page.
driver.get('https://www.swissreg.ch/database-client/search/query/certificates')

# Draws a 3px solid red border around +element+ by running a short JavaScript
# snippet through +driver+; the element is handed to the script as its first
# argument. Returns whatever +driver.execute_script+ returns.
def highlight_element(driver, element)
  border_script = "arguments[0].style.border='3px solid red'"
  driver.execute_script(border_script, element)
end

# Looks up the element described by +locator+ via +wait.until+, highlights it
# and returns it. A stale-element error raised during the lookup or the
# highlighting triggers a fresh attempt; after the third failed attempt the
# error is re-raised to the caller. Other errors are not handled here.
def find_element_with_retry(driver, wait, locator)
  tries = 0
  begin
    tries += 1
    located = wait.until { driver.find_element(locator) }
    highlight_element(driver, located)
    located
  rescue Selenium::WebDriver::Error::StaleElementReferenceError
    # Give up once three attempts have been made; otherwise start over.
    raise if tries >= 3

    retry
  end
end

# Locate the search field, search for +search_number+ and print each result's
# text followed by the href of its first link.
#
# The whole interaction sits inside one begin/ensure so the browser is shut
# down on every exit path: normal completion, the explicit `exit 1` below
# (SystemExit still runs ensure blocks), or an unexpected error.
begin
  # Wait for the search input field to be present
  begin
    search_field = find_element_with_retry(driver, wait, css: 'input[data-cy="search-field-input"]')
  rescue Selenium::WebDriver::Error::TimeoutError
    puts "Unable to locate the search input field. Please check the selector and the page structure."
    exit 1
  end

  # Enter the search number into the input field
  search_field.send_keys(search_number)

  # Simulate pressing the Enter key to perform the search
  search_field.send_keys(:enter)

  # Wait for 10 seconds to allow results to load
  sleep 10

  # find_elements returns an empty list (it does not raise) when nothing
  # matches, so the "no results" case has to be detected explicitly.
  results = driver.find_elements(css: 'ipi-dynamic-result-item-renderer')

  if results.empty?
    puts "No results found. Please check the search input and try again."
  else
    results.each do |result|
      puts result.text
      # Use find_elements here as well: a result without an <a> must not abort
      # the loop and hide the remaining results.
      link = result.find_elements(tag_name: 'a').first
      puts link.attribute('href') if link
    end
  end
ensure
  # Close the browser
  driver.quit
end
zdavatz commented 3 days ago

A Watir script that will find the link:

require 'watir'
require 'nokogiri'
require 'logger'

# Log everything, down to DEBUG level, on standard output.
logger = Logger.new(STDOUT).tap { |log| log.level = Logger::DEBUG }

# Bail out with the usage line (status 1) unless exactly one argument made of
# five digits was supplied.
arguments_valid = ARGV.length == 1 && ARGV.first.match(/\A\d{5}\z/)
unless arguments_valid
  puts "Usage: ruby search_certificates.rb <five-digit-number>"
  exit 1
end

# The validated argument is the number to search for.
search_number = ARGV.first

# Main flow of the single-number lookup: open the Swissreg search page in
# headless Chrome, search for +search_number+, log diagnostics about the
# rendered page and log the first detail link.
#
# Exit status: 0 when a detail link is found; 1 when no Angular elements were
# rendered, when no link is found, or when a StandardError occurs.
# `exit` raises SystemExit, which is not a StandardError, so it bypasses the
# rescue clause but still runs the ensure clause.
begin
  # Set up the Watir browser
  browser = Watir::Browser.new :chrome, headless: true

  # Attempt to navigate to the webpage with error handling
  logger.info "Attempting to access the webpage"
  browser.goto('https://www.swissreg.ch/database-client/search/query/certificates')

  # Wait for the search input field to be present
  logger.info "Waiting for the search input field to be present..."
  browser.text_field(data_cy: 'search-field-input').wait_until(&:present?)

  # Enter the search number into the search input field
  logger.info "Entering the search number: #{search_number}"
  browser.text_field(data_cy: 'search-field-input').set(search_number)

  # Submit the search form
  logger.info "Submitting the search form..."
  browser.send_keys :enter

  # Wait for the search results to load
  logger.info "Waiting for the search results to load..."
  sleep(10) # Adjust the sleep time as needed

  # Get the page HTML
  page_html = browser.html

  # Parse the page content with Nokogiri
  doc = Nokogiri::HTML(page_html)

  # Look for Angular-specific elements and classes
  angular_elements = doc.css('div.ipi-detail-link.ng-star-inserted')
  logger.debug "Angular Elements Found: #{angular_elements.length}"

  # Look for input fields with Angular attributes
  ng_input_fields = doc.css('input[ng-reflect-name]')
  logger.debug "Angular Input Fields: #{ng_input_fields.length}"
  ng_input_fields.each do |input|
    logger.debug "Input Field: name='#{input['ng-reflect-name']}', type='#{input['type']}'"
  end

  # Additional Angular-specific searches
  ng_search_elements = doc.css('[data-cy="search-field-input"]')
  logger.debug "Search Field Elements: #{ng_search_elements.length}"

  # Detailed debugging of page content
  logger.debug "Page Title: #{doc.title}"

  # Extract and log JavaScript content
  js_scripts = doc.css('script[type="text/javascript"]')
  logger.debug "JavaScript Scripts Found: #{js_scripts.length}"
  js_scripts.each do |script|
    logger.debug "JavaScript Content: #{script.content}"
  end

  # If none of the three probes matched anything, log a preview of the
  # document and give up.
  if angular_elements.empty? && ng_input_fields.empty? && ng_search_elements.empty?
    logger.error "No Angular elements found. Possible rendering issue."
    logger.debug "Full Document Preview:"
    logger.debug page_html[0..1000] # Characters 0 through 1000 of the page source
    exit 1
  end

  # Note: This script cannot fully interact with JavaScript-rendered content
  logger.warn "WARNING: This script may not fully interact with JavaScript-rendered pages."
  logger.warn "Consider using a tool like Selenium for complete interaction."

  # Output debug information about the page
  puts "Page investigation complete. Unable to fully process dynamic content."

  # Find the mat-sidenav-content element
  sidenav_content = doc.at_css('mat-sidenav-content')

  if sidenav_content
    logger.debug "mat-sidenav-content Found"
    # Output the content of mat-sidenav-content
    logger.debug "mat-sidenav-content HTML: #{sidenav_content.to_html}"
  else
    logger.error "mat-sidenav-content not found"
  end

  # Find the link using a CSS selector
  link = doc.at('a.ipi-detail-link.ng-star-inserted')

  if link
    logger.debug "Link Found: #{link.text}"
    logger.debug "Link URL: #{link['href']}"

    # Abort the script after finding the link
    logger.info "Link found. Aborting the script."
    exit 0
  else
    logger.error "Link not found"
    # Report the failure through the exit status as well, like the other
    # failure paths of this script.
    exit 1
  end

rescue StandardError => e
  logger.error "An error occurred:"
  logger.error e.message
  logger.error e.backtrace.join("\n")
  exit 1
ensure
  # Close the browser. `browser` is nil when Watir::Browser.new itself failed;
  # the safe navigation keeps that case from raising NoMethodError here and
  # masking the original error.
  browser&.close
end
zdavatz commented 3 days ago

A script that will use an input file:

require 'watir'
require 'nokogiri'
require 'logger'

# Configure logging: everything down to DEBUG goes to standard output.
logger = Logger.new(STDOUT)
logger.level = Logger::DEBUG

# Check if the file path is provided as a command-line argument
if ARGV.length != 1
  puts "Usage: ruby search_certificates.rb <file_path>"
  exit 1
end

file_path = ARGV[0]

# Fail with a readable message instead of a stack trace when the input file
# is missing or unreadable.
unless File.file?(file_path) && File.readable?(file_path)
  puts "Cannot read input file: #{file_path}"
  exit 1
end

# Read the list of five-digit numbers from the file, one per line.
# Surrounding whitespace is removed and blank lines are dropped so that an
# empty line never triggers an empty search.
entries = File.readlines(file_path).map(&:strip).reject(&:empty?)

# Keep only entries that are exactly five digits, the same format the
# single-number scripts require; everything else is reported and skipped.
search_numbers, rejected = entries.partition { |entry| entry.match?(/\A\d{5}\z/) }
rejected.each do |entry|
  logger.warn "Skipping invalid entry (expected five digits): #{entry.inspect}"
end

# Batch lookup: for every entry in +search_numbers+, open the Swissreg search
# page in headless Chrome, run the search and report the first detail link.
#
# Any StandardError aborts the whole batch with exit status 1; the ensure
# clause closes the browser on every path.
begin
  # Set up the Watir browser
  browser = Watir::Browser.new :chrome, headless: true

  search_numbers.each do |search_number|
    # Attempt to navigate to the webpage with error handling
    logger.info "Attempting to access the webpage for search number: #{search_number}"
    browser.goto('https://www.swissreg.ch/database-client/search/query/certificates')

    # Wait for the search input field to be present
    logger.info "Waiting for the search input field to be present..."
    search_field = browser.text_field(data_cy: 'search-field-input')
    search_field.wait_until(&:present?)

    # Enter the search number into the search input field
    logger.info "Entering the search number: #{search_number}"
    search_field.set(search_number)

    # Submit the search form by pressing Enter
    logger.info "Submitting the search form..."
    search_field.send_keys :enter

    # Wait for the search results to load: poll for the detail link for up to
    # 10 seconds instead of sleeping a fixed time, so fast results are picked
    # up immediately and slow ones are not reported as missing.
    logger.info "Waiting for the search results to load..."
    begin
      browser.element(css: 'a.ipi-detail-link.ng-star-inserted').wait_until(timeout: 10, &:exists?)
    rescue Watir::Wait::TimeoutError
      # No link showed up in time; the lookup below reports it as not found.
      logger.warn "No result link appeared within 10 seconds for search number #{search_number}"
    end

    # Get the page HTML
    page_html = browser.html

    # Parse the page content with Nokogiri
    doc = Nokogiri::HTML(page_html)

    # Find the link using a CSS selector
    link = doc.at('a.ipi-detail-link.ng-star-inserted')

    if link
      logger.debug "Link Found: #{link.text}"
      logger.debug "Link URL: #{link['href']}"

      # Output the link information
      puts "Link Found for search number #{search_number}: #{link.text} - #{link['href']}"
    else
      logger.error "Link not found for search number #{search_number}"
    end
  end

rescue StandardError => e
  logger.error "An error occurred:"
  logger.error e.message
  logger.error e.backtrace.join("\n")
  exit 1
ensure
  # Close the browser. `browser` is nil when Watir::Browser.new itself failed;
  # the safe navigation keeps that case from raising NoMethodError here and
  # masking the original error.
  browser&.close
end
zdavatz commented 1 day ago

I think we can go with manually editable fields here, as there are only about 20 active patents.