ReactionMechanismGenerator / ARC

ARC - Automatic Rate Calculator
https://reactionmechanismgenerator.github.io/ARC/index.html
MIT License

A network error causes ARC to crash #42

Closed · alongd closed this issue 5 years ago

alongd commented 5 years ago

When the machine sending commands to the server loses connectivity, ARC crashes with one of the tracebacks below (a possible mitigation is sketched after them):

"""---------------------------------------------------------------------------
gaierror                                  Traceback (most recent call last)
<ipython-input-5-f62c9e69185c> in <module>()
      1 arc0 = arc.ARC(project='Arc-cooptima-methylpropylether-cbsqb3', composite_method='cbs-qb3', rmg_species_list=[], arc_species_list=arc_species_list,ess_settings = {'gaussian': 'c3ddb', 'molpro': 'pharos', 'qchem': 'pharos'})
----> 2 arc0.execute()
/home/dranasinghe/Software/ARC/arc/main.pyc in execute(self)
    234                                    scan_level=self.scan_level, fine=self.fine, settings=self.settings,
    235                                    generate_conformers=self.generate_conformers, scan_rotors=self.scan_rotors,
--> 236                                    initial_trsh=self.initial_trsh)
    237         prc = Processor(project=self.project, species_dict=self.scheduler.species_dict, output=self.scheduler.output,
    238                         use_bac=self.use_bac, model_chemistry=self.model_chemistry)
/home/dranasinghe/Software/ARC/arc/scheduler.pyc in __init__(self, project, settings, species_list, composite_method, conformer_level, opt_level, freq_level, sp_level, scan_level, fine, generate_conformers, scan_rotors, initial_trsh)
    151                 self.species_dict[species.label].generate_conformers()
    152         self.timer = True
--> 153         self.schedule_jobs()
    154 
    155     def schedule_jobs(self):
/home/dranasinghe/Software/ARC/arc/scheduler.pyc in schedule_jobs(self)
    261                             and not self.job_dict[label]['scan'][job_name].job_id in self.servers_jobs_ids:
    262                         job = self.job_dict[label]['scan'][job_name]
--> 263                         successful_server_termination = self.end_job(job=job, label=label, job_name=job_name)
    264                         if successful_server_termination:
    265                             self.check_scan_job(label=label, job=job)
/home/dranasinghe/Software/ARC/arc/scheduler.pyc in end_job(self, job, label, job_name)
    325                          fine=job.fine, software=job.software, shift=job.shift, trsh=job.trsh, memory=job.memory,
    326                          conformer=job.conformer, ess_trsh_methods=job.ess_trsh_methods, scan=job.scan,
--> 327                          pivots=job.pivots, occ=job.occ)
    328         self.running_jobs[label].pop(self.running_jobs[label].index(job_name))
    329         self.timer = False
/home/dranasinghe/Software/ARC/arc/scheduler.pyc in run_job(self, label, xyz, level_of_theory, job_type, fine, software, shift, trsh, memory, conformer, ess_trsh_methods, scan, pivots, occ)
    303                 self.job_dict[label][job_type] = dict()
    304             self.job_dict[label][job_type][job.job_name] = job
--> 305             self.job_dict[label][job_type][job.job_name].run()
    306         else:
    307             # Running a conformer job. Append differently to job_dict.
/home/dranasinghe/Software/ARC/arc/job/job.pyc in run(self)
    508             logging.info('Running job {name} for {label}'.format(name=self.job_name, label=self.species_name))
    509         logging.debug('writing submit script...')
--> 510         self.write_submit_script()
    511         logging.debug('writing input file...')
    512         self.write_input_file()
/home/dranasinghe/Software/ARC/arc/job/job.pyc in write_submit_script(self)
    285             f.write(self.submit)
    286         if self.settings['ssh']:
--> 287             self._upload_submit_file()
    288 
    289     def write_input_file(self):
/home/dranasinghe/Software/ARC/arc/job/job.pyc in _upload_submit_file(self)
    483         ssh.send_command_to_server(command='mkdir -p {0}'.format(self.remote_path))
    484         remote_file_path = os.path.join(self.remote_path, submit_filename[servers[self.server]['cluster_soft']])
--> 485         ssh.upload_file(remote_file_path=remote_file_path, file_string=self.submit)
    486 
    487     def _upload_input_file(self):
/home/dranasinghe/Software/ARC/arc/job/ssh.pyc in upload_file(self, remote_file_path, local_file_path, file_string)
     64             raise InputError('Cannot upload a non-existing file.'
     65                              ' Check why file in path {0} is missing.'.format(local_file_path))
---> 66         sftp, ssh = self.connect()
     67         with sftp.open(remote_file_path, "w") as f_remote:
     68             if file_string:
/home/dranasinghe/Software/ARC/arc/job/ssh.pyc in connect(self)
    168         ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
    169         ssh.load_system_host_keys(filename=self.key)
--> 170         ssh.connect(hostname=self.address, username=self.un)
    171         sftp = ssh.open_sftp()
    172         return sftp, ssh
/home/dranasinghe/anaconda2/envs/rmg_env/lib/python2.7/site-packages/paramiko/client.pyc in connect(self, hostname, port, username, password, pkey, key_filename, timeout, allow_agent, look_for_keys, compress, sock, gss_auth, gss_kex, gss_deleg_creds, gss_host, banner_timeout, auth_timeout, gss_trust_dns, passphrase)
    332             errors = {}
    333             # Try multiple possible address families (e.g. IPv4 vs IPv6)
--> 334             to_try = list(self._families_and_addresses(hostname, port))
    335             for af, addr in to_try:
    336                 try:
/home/dranasinghe/anaconda2/envs/rmg_env/lib/python2.7/site-packages/paramiko/client.pyc in _families_and_addresses(self, hostname, port)
    202         guess = True
    203         addrinfos = socket.getaddrinfo(
--> 204             hostname, port, socket.AF_UNSPEC, socket.SOCK_STREAM
    205         )
    206         for (family, socktype, proto, canonname, sockaddr) in addrinfos:
gaierror: [Errno -2] Name or service not known
"""

or:

Traceback (most recent call last):
  File "/home/alongd/Code/ARC//ARC.py", line 95, in <module>
    main()
  File "/home/alongd/Code/ARC//ARC.py", line 78, in main
    arc_object.execute()
  File "/home/alongd/Code/ARC/arc/main.py", line 413, in execute
    project_directory=self.project_directory)
  File "/home/alongd/Code/ARC/arc/scheduler.py", line 175, in __init__
    self.schedule_jobs()
  File "/home/alongd/Code/ARC/arc/scheduler.py", line 253, in schedule_jobs
    successful_server_termination = self.end_job(job=job, label=label, job_name=job_name)
  File "/home/alongd/Code/ARC/arc/scheduler.py", line 349, in end_job
    pivots=job.pivots, occ=job.occ)
  File "/home/alongd/Code/ARC/arc/scheduler.py", line 327, in run_job
    self.job_dict[label][job_type][job.job_name].run()
  File "/home/alongd/Code/ARC/arc/job/job.py", line 507, in run
    self.write_submit_script()
  File "/home/alongd/Code/ARC/arc/job/job.py", line 284, in write_submit_script
    self._upload_submit_file()
  File "/home/alongd/Code/ARC/arc/job/job.py", line 482, in _upload_submit_file
    ssh.upload_file(remote_file_path=remote_file_path, file_string=self.submit)
  File "/home/alongd/Code/ARC/arc/job/ssh.py", line 66, in upload_file
    sftp, ssh = self.connect()
  File "/home/alongd/Code/ARC/arc/job/ssh.py", line 180, in connect
    ssh.connect(hostname=self.address, username=self.un)
  File "/home/alongd/anaconda2/envs/rmg_env/lib/python2.7/site-packages/paramiko/client.py", line 334, in connect
    to_try = list(self._families_and_addresses(hostname, port))
  File "/home/alongd/anaconda2/envs/rmg_env/lib/python2.7/site-packages/paramiko/client.py", line 204, in _families_and_addresses
    hostname, port, socket.AF_UNSPEC, socket.SOCK_STREAM
socket.gaierror: [Errno -2] Name or service not known

or:

---------------------------------------------------------------------------
gaierror                                  Traceback (most recent call last)
<ipython-input-3-580c043d58f4> in <module>()
      3 #                     fine=True, generate_conformers=True, scan_rotors=True, use_bac=True, model_chemistry
      4 arc0 = arc.ARC(project='ArcDemo', rmg_species_list=rmg_species_list, arc_species_list=arc_species_list)
----> 5 arc0.execute()

/home/alongd/Code/ARC/arc/main.pyc in execute(self)
    411                                    generate_conformers=self.generate_conformers, scan_rotors=self.scan_rotors,
    412                                    initial_trsh=self.initial_trsh, restart_dict=self.restart_dict,
--> 413                                    project_directory=self.project_directory)
    414         prc = Processor(project=self.project, species_dict=self.scheduler.species_dict, output=self.scheduler.output,
    415                         use_bac=self.use_bac, model_chemistry=self.model_chemistry)

/home/alongd/Code/ARC/arc/scheduler.pyc in __init__(self, project, settings, species_list, composite_method, conformer_level, opt_level, freq_level, sp_level, scan_level, project_directory, fine, generate_conformers, scan_rotors, initial_trsh, restart_dict)
    173                 self.species_dict[species.label].generate_conformers()
    174         self.timer = True
--> 175         self.schedule_jobs()
    176 
    177     def schedule_jobs(self):

/home/alongd/Code/ARC/arc/scheduler.pyc in schedule_jobs(self)
    180         """
    181         if self.generate_conformers:
--> 182             self.run_conformer_jobs()
    183         while self.running_jobs != {}:  # loop while jobs are still running
    184             logging.debug('Currently running jobs:\n{0}'.format(self.running_jobs))

/home/alongd/Code/ARC/arc/scheduler.pyc in run_conformer_jobs(self)
    370                     for i, xyz in enumerate(self.species_dict[label].conformers):
    371                         self.run_job(label=label, xyz=xyz, level_of_theory=self.conformer_level, job_type='conformer',
--> 372                                      conformer=i)
    373                 else:
    374                     if 'opt' not in self.job_dict[label] and 'composite' not in self.job_dict[label]\

/home/alongd/Code/ARC/arc/scheduler.pyc in run_job(self, label, xyz, level_of_theory, job_type, fine, software, shift, trsh, memory, conformer, ess_trsh_methods, scan, pivots, occ)
    330             self.running_jobs[label].append('conformer{0}'.format(conformer))  # mark as a running job
    331             self.job_dict[label]['conformers'][conformer] = job  # save job object
--> 332             self.job_dict[label]['conformers'][conformer].run()  # run the job
    333         if job.server not in self.servers:
    334             self.servers.append(job.server)

/home/alongd/Code/ARC/arc/job/job.pyc in run(self)
    507         self.write_submit_script()
    508         logging.debug('writing input file...')
--> 509         self.write_input_file()
    510         if self.settings['ssh']:
    511             ssh = SSH_Client(self.server)

/home/alongd/Code/ARC/arc/job/job.pyc in write_input_file(self)
    474             f.write(self.input)
    475         if self.settings['ssh']:
--> 476             self._upload_input_file()
    477 
    478     def _upload_submit_file(self):

/home/alongd/Code/ARC/arc/job/job.pyc in _upload_input_file(self)
    486         ssh.send_command_to_server(command='mkdir -p {0}'.format(self.remote_path))
    487         remote_file_path = os.path.join(self.remote_path, input_filename[self.software])
--> 488         ssh.upload_file(remote_file_path=remote_file_path, file_string=self.input)
    489 
    490     def _download_output_file(self):

/home/alongd/Code/ARC/arc/job/ssh.pyc in upload_file(self, remote_file_path, local_file_path, file_string)
     64             raise InputError('Cannot upload a non-existing file.'
     65                              ' Check why file in path {0} is missing.'.format(local_file_path))
---> 66         sftp, ssh = self.connect()
     67         with sftp.open(remote_file_path, "w") as f_remote:
     68             if file_string:

/home/alongd/Code/ARC/arc/job/ssh.pyc in connect(self)
    178             # This sometimes gives "SSHException: Error reading SSH protocol banner[Error 104] Connection reset by peer"
    179             # Try again:
--> 180             ssh.connect(hostname=self.address, username=self.un)
    181         sftp = ssh.open_sftp()
    182         return sftp, ssh

/home/alongd/anaconda2/envs/rmg_env/lib/python2.7/site-packages/paramiko/client.pyc in connect(self, hostname, port, username, password, pkey, key_filename, timeout, allow_agent, look_for_keys, compress, sock, gss_auth, gss_kex, gss_deleg_creds, gss_host, banner_timeout, auth_timeout, gss_trust_dns, passphrase)
    332             errors = {}
    333             # Try multiple possible address families (e.g. IPv4 vs IPv6)
--> 334             to_try = list(self._families_and_addresses(hostname, port))
    335             for af, addr in to_try:
    336                 try:

/home/alongd/anaconda2/envs/rmg_env/lib/python2.7/site-packages/paramiko/client.pyc in _families_and_addresses(self, hostname, port)
    202         guess = True
    203         addrinfos = socket.getaddrinfo(
--> 204             hostname, port, socket.AF_UNSPEC, socket.SOCK_STREAM
    205         )
    206         for (family, socktype, proto, canonname, sockaddr) in addrinfos:

gaierror: [Errno -3] Temporary failure in name resolution
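
All three tracebacks share the same root cause: `connect()` in `arc/job/ssh.py` calls `paramiko.SSHClient.connect()`, which raises `socket.gaierror` when name resolution fails ([Errno -2] "Name or service not known", [Errno -3] "Temporary failure in name resolution"), and nothing on the call path back up through the Scheduler catches it. Wrapping the connect call in a retry loop would let ARC ride out transient outages. Below is a minimal sketch of that idea using only paramiko's public API; `connect_with_retries`, its signature, and the retry policy are illustrative, not the fix that actually landed:

```python
import socket
import time

import paramiko


def connect_with_retries(address, username, key_filename=None,
                         max_attempts=5, base_delay=60):
    """Open SSH + SFTP, retrying on transient network errors.

    Hypothetical helper: the name, arguments, and retry policy are
    illustrative only, not ARC's actual implementation.
    """
    for attempt in range(1, max_attempts + 1):
        ssh = paramiko.SSHClient()
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        ssh.load_system_host_keys(filename=key_filename)
        try:
            ssh.connect(hostname=address, username=username)
            return ssh.open_sftp(), ssh  # same (sftp, ssh) pair ssh.py returns
        except (socket.gaierror, paramiko.SSHException):
            # gaierror covers both failures seen above:
            #   [Errno -2] Name or service not known
            #   [Errno -3] Temporary failure in name resolution
            if attempt == max_attempts:
                raise  # give up; let the caller decide what to do
            time.sleep(base_delay * attempt)  # linear back-off before retrying
```

The existing comment in `connect()` about "Error reading SSH protocol banner ... Connection reset by peer" suggests `paramiko.SSHException` belongs in the same retry net as `gaierror`.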
alongd commented 5 years ago

Closed via #43.