7 Replies Latest reply on Jul 30, 2018 10:28 PM by Intel Corporation

    JupyterHub crashes when running a multiprocessing (6-core) script

    amitsome

      Hi, I'm running a very simple script;

      however, the entire system crashes after 2-3 minutes of running.

       

import json
import multiprocessing as mp
import os
import shutil
import subprocess
import sys

import pandas

       

       

      def jsons2df(path):

          json_list=[]

          file_list = os.scandir(path)

          counter=0

          error_counter=0

          for j_file in file_list:

              counter+=1

              if counter%10000==0:

                  print(counter)

              #with open(os.path.join(path,j_file),"r") as j:

              with open(j_file.path,"r") as j:

                  try:

                      data = json.load(j)

                  except:

                      print("ERROR: ",j_file)

                      error_counter+=1

                      if error_counter>1000:

                          break

                      else:

                          continue

                  json_list.append(data)

          try:

              json_df= pandas.DataFrame(json_list)

              b=json_df["url"].apply(lambda x: x.split("/")[2])

              json_df['domain']=b

              return json_df

          except:

              return json_list

         

      def get_and_extract(number):

          print("starting: ",number)

          if not os.path.isfile("%s.tar"%(number)):

              if not os.path.isdir("%s"%(number)):

                  if not os.path.isfile("./pickles/wiki_%s.pickle"%(number)):

                      print("downloading")

                      url="http://data.dws.informatik.uni-mannheim.de/webtables/2015-07/englishCorpus/compressed/%s.tar.gz"%(number)

                      os.system("wget %s"%(url))

                      print("extracting gz #1")

                      os.system("tar -xzf %s.tar.gz >/dev/null"%(number))

                      os.system("rm -rf %s.tar.gz"%(number))

       

       

          if  os.path.isfile("%s.tar"%(number)):

              print("extracting tar #2")

              os.system("tar --skip-old-files -xf %s.tar >/dev/null"%(number))

              os.system("rm -rf %s.tar"%(number))

       

       

          if not os.path.isfile("%s.tar"%(number)):

              if number.startswith("0"):

                  number=number[1:]

              path="./%s"%(number)

              if not os.path.isfile("./pickles/wiki_%s.pickle"%(number)):

                  df =jsons2df(path)

                  print("pickling:")

                  df.to_pickle("./pickles/df_%s.pickle"%(number))

                  b=df[df.domain.str.contains("en.wikipedia.org")]

                  b=b[~b.url.str.contains('Special:Book')]

                  b.to_pickle("./pickles/wiki_%s.pickle"%(number))

              if os.path.isdir("%s"%(number)):

                  print("Deleting:")

                  os.system("rsync -r --delete emptydir/ %s/"%(number))

                  os.system("rmdir %s"%(number))

                  print("Done: ",number)   

      #get_and_extract(number)

       

       

      def main(number1,number2):

               

          #pool=mp.Pool(CORES)

          pool = mp.Pool(6)

          print(number1,number2)

          print("Looping:")

          for i in range(int(number1),int(number2)+1):

              print(i)

              x =str(i)

              y=0

              x =str(i)

              y=0

              a=pool.apply_async(get_and_extract,(x,))

              #a=pool.apply_async(get_and_extract,(x,))

          #print(a)

          pool.close()

      """

      if __name__ == '__main__':

          number=sys.argv[1]

          number2=sys.argv[2]

          main(number,number2)

      """