NetSerf/ 40775 764 764 0 6544252204 10105 5ustar jaejaeNetSerf/serf.py100775 764 764 16265 6544252005 11547 0ustar jaejae#!/usr/bin/python -O # # Python Agent Module # # Copyright (c) 1997 John Eikenberry # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU Library General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. # # The author can be reached by email at jae@ai.uga.edu # import cgi,shelve,sys,urlparse path = '/home/httpd/cgi-bin/spider' sys.path.append(path) from snet import Snet,NET_SHELF IMP_URL_SHELF = path + '/imp_url.shelf' # def start_page(): url_shelf = shelve.open(IMP_URL_SHELF) if url_shelf: topics = url_shelf.keys() else: topics = () print "Content-type: text/html" print print "" print "Net Serf Demo" print "" print "" print "
" print "

Net Serf Demo Start Page

" print "
" print "

Introduction

" print "Welcome to the Net Serf demo. This demo was built ONLY as" print "a testing mechanism for the semantic network backend, so" print "don't expect any pretty logos or even tons of usable" print "functionality at this point. Keeping this in mind, feel free" print "to play with the system, and be sure to let" print "me know what you" print "think.

" print "To start with this system, either choose one of the existing" print "topics immediately below, or create your own in the text" print "field provided." print "
" print "

Existing Topics

" print "
" print "

Start a new topic

" print "
" # def topic_page(form): topic = form['do_topic'].value url_shelf = shelve.open(IMP_URL_SHELF) if url_shelf.has_key(topic): imp_url_d = url_shelf[topic] else: imp_url_d = {} t_url_list = [] for url,tags in imp_url_d.items(): if 'start' in tags: t_url_list.append(url) print "Content-type: text/html" print print "" print "Net Serf Demo" print "" print "
" print "

Net Serf Demo Topics Page

" print "Use any of these search engines or use one of the links" print "below as your starting point. Just as a hint, webcrawler" print "and google tend to work the best.

" print "NOTE: This may take a while, go get a cup of coffee after submitting your query.
" print "Topic: %s
" % (topic,) print "" % (topic,) print "

Search Engines

" print "" print "" print "" print "" print "" print "

" print "Here you have the option of giving the system its initial set" print "of keywords if this is a new topic or if this is an already" print "existing topic, you can use this to enter new keywords." print "

" print "Keywords for search:
" print "

Previously saved starting sites

" print "These are meant to be used as the starting sites after the" print "system has been 'primed' with the search engines." print "" print "" print "
" # def results_page(form): print "Content-type: text/html" print print "" print "Net Serf Demo" print "" print "
" print "

Net Serf Demo Results Page

" start_site = form['start'].value if form.has_key('keywords'): keywords = form['keywords'].value else: keywords = '' topic = form['topic'].value print "" % (topic,) message = { 'topic':topic } if start_site in ('Altavista','Infoseek','Webcrawler','Google'): message['engine'] = start_site if keywords: message['keywords'] = keywords else: url_tuple = urlparse.urlparse(start_site) message['machine'] = url_tuple[1] message['uri'] = url_tuple[2] import Spider spider = Spider.Spider() spider.send_request('Butler',message) spider.loop() print "" print "
" # def do_scores(form): scores = {} message = {} imp_urls = {} topic = form['topic'].value for key in form.keys(): if key == 'topic' or key == 'scores': continue scores[key] = form[key].value message['topic'] = topic message['scores'] = scores for url,score in scores.items(): if score == '0.99': imp_urls[url] = ('start','watch') elif score == '0.66': imp_urls[url] = ('watch',) elif score == '0.33': imp_urls[url] = ('start',) url_shelf = shelve.open(IMP_URL_SHELF) if url_shelf.has_key(topic): url_dict = url_shelf[topic] else: url_dict = {} url_dict.update(imp_urls) url_shelf[topic] = url_dict url_shelf.close() import Spider spider = Spider.Spider() spider.send_request('Profiler',message) spider.loop() # def show_net(form): topic = form['topic'].value net_shelf = shelve.open(NET_SHELF) if net_shelf.has_key(topic): net = net_shelf[topic] else: print "Content-type: text/html" print print "Error: No net found" return print "Content-type: text/html" print print "" print "Net Serf Demo" print "" print "
" net.html_show() print "
" print "" # if __name__ == '__main__': form = cgi.FieldStorage() if form.has_key('start'): results_page(form) elif form.has_key('do_topic'): topic_page(form) elif form.has_key('scores'): do_scores(form) start_page() elif form.has_key('show'): show_net(form) else: start_page() NetSerf/Butler.py100644 764 764 2431 6544252016 12010 0ustar jaejae# # Python Agent Module # # Copyright (c) 1997 John Eikenberry # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU Library General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. # # The author can be reached by email at jae@ai.uga.edu # import bot import url_finder,url_hound bot.Butler_agent_classes = [url_finder.url_finder,url_hound.url_hound] class Butler(bot.Menski): def set_goals(self): self.req_subgoals.append('url_finder') if __name__ == '__main__': butTest = Butler() message = { 'topic':'linux', 'keywords':['+linux +free -microsoft','+"operating system"'] } butTest.send_request('url_finder',message) # import profile # profile.run('butTest()') butTest() NetSerf/Processor.py100644 764 764 2604 6544252016 12534 0ustar jaejae# # Python Agent Module # # Copyright (c) 1997 John Eikenberry # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. 
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU Library General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. # # The author can be reached by email at jae@ai.uga.edu # import bot import html_parser,extractor bot.Processor_agent_classes = [html_parser.html_parser,extractor.extractor] class Processor(bot.Menski): def set_goals(self): self.req_subgoals.append('html_parser') if __name__ == '__main__': bot.Processor_agent_classes = [html_parser.html_parser,extractor.extractor] proTest = Processor() proTest.send_request('html_parser',{'url':'http://kali.mirage.org/projects.html', 'topic':'linux', 'page':open('./ldp.html').read()}) ## import profile ## profile.run('proTest()') proTest.loop() NetSerf/Profiler.py100644 764 764 4073 6544252016 12341 0ustar jaejae# # Python Agent Module # # Copyright (c) 1997 John Eikenberry # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU Library General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. 
# # The author can be reached by email at jae@ai.uga.edu # import bot import snet_builder,snet_maintainer,reporter,display bot.Profiler_agent_classes = [ snet_builder.snet_builder, reporter.reporter, snet_maintainer.snet_maintainer, display.display ] class Profiler(bot.Menski): def _proxy(self,message): # Get message info from either the url-fetcher (if), # display-agent (if), or the snet-builder (elif) if message.has_key('keywords'): message['to_name'] = 'snet_builder' else: message['to_name'] = 'reporter' return message if __name__ == '__main__': proTest = Profiler() urllist = ['www.ai.uga.edu/~jae/linux.html','www.linux.org','www.ssc.com/linux'] url_message = { 'topic':'linux', 'urllist':urllist } bld_message1 = { 'topic':'linux', 'url':'www.ai.uga.edu/~jae/linux.html', 'snet':net1 } bld_message2 = { 'topic':'linux', 'url':'www.ai.uga.edu/~jae/linux.html', 'snet':net2 } bld_message3 = { 'topic':'linux', 'url':'www.ai.uga.edu/~jae/linux.html', 'snet':net3 } proTest.send_request( 'html_parser', {'url':'http://kali.mirage.org/projects.html', 'topic':'linux', 'page':open('/home/httpd/html/ai-full.html').read()}) ## import profile ## profile.run('proTest()') proTest() NetSerf/Spider.py100644 764 764 2367 6544252016 12011 0ustar jaejae# # Python Agent Module # # Copyright (c) 1997 John Eikenberry # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. 
# # You should have received a copy of the GNU Library General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. # # The author can be reached by email at jae@ai.uga.edu # import bot import Processor,Profiler,Butler bot.Spider_agent_classes = [ Processor.Processor, Profiler.Profiler, Butler.Butler ] class Spider(bot.Menski): pass if __name__ == '__main__': spiderTest = Spider() message = { 'topic':'linux', 'keywords':['+linux +documentation'] } spiderTest.send_request('Butler',message) spiderTest.loop() ## import profile ## profile.run('proTest()') NetSerf/bb.py100644 764 764 10006 6544252016 11153 0ustar jaejae# # Python Agent Module # # Copyright (c) 1997 John Eikenberry # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU Library General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. # # The author can be reached by email at jae@ai.uga.edu # class BlackBoard: """ Black Board (BB) object used for agent messaging. Provides the following methods: update(other_bb) [aliased to update_local and update_parent] ->Merges the different messages in other_bb into BB. activity_test() ->test to see if agents were active this cycle. request_test(agent_name) ->Agent looks at BB for requests it can handle. state_test(agent_states) ->Agent looks at BB for states it reacts to. 
post(list_o_messages) ->Agent posts a list of messages to the BB. """ def __init__(self): self.bb = [] self.pbb = None # def __getitem__(self,i): return self.bb[i] # Probably not needed def update_local(self,pbb): """ Update the black board with new messages from either the parent or child black board. """ if pbb == self.bb: return # if message is in pbb and not in the lbb, add it to the lbb for message in pbb: if message not in self.bb: self.bb.append(message) ## # if message is not in bpp and is in lbb, remove it from lbb ## # unless it belongs to one of the collective of that menski ## for message in self.bb: ## if message not in pbb: # and message not in collective ## # if it doesn't belong to a local ao ## self.bb.remove(message) def update(self,bb): """ Update the black board with new messages from either the parent or child black board. """ if bb == self.bb: return for message in bb: if message not in self.bb: self.bb.append(message) # Probably not needed def activity_test(self): """ Have the agents done anything this cycle. """ flag = 0 for message in self.bb: if message.has_key('inform'): self.bb.remove([message]) flag = 1 return flag # def message_test(self,agent): """ Check BB for requests to agent_name. """ msg_list = [] for message in self.bb: if message.get('to_id') == agent.id or message.get('to_name') == agent.name: if agent.__type__ == 'Menski': msg_list.append(message) if __debug__: print agent.name, "accepts Message:", print message.keys(), print message['to_name'] else: if __debug__: print agent.name, "accepts Message:", print message.keys(), print message['to_name'] self.remove([message]) return message if agent.__type__ == "AO": return None elif msg_list: self.remove(msg_list) return msg_list # def state_test(self,agent_states): """ Check BB for states that match up with agents trigger states. 
""" for message in self.bb: if message.get('info') in agent_states: return message else: return None def post(self,msg_list): """ Post list of requests to BB. """ if __debug__: print "\tPosting:", for post in msg_list: print post.keys(), print post['to_name'] for message in msg_list: self.bb.append(message) def remove(self,msg_list): if __debug__: print "\tRemoving:", for post in msg_list: print post.keys(), print post['to_name'] for message in msg_list: if message in self.bb: self.bb.remove(message) if __debug__: def show(self,name): print "<-BB-"+name+"->", for post in self.bb: print post.keys(), print post['to_name'],'
' print '

' NetSerf/bot.py100644 764 764 27133 6544252016 11365 0ustar jaejae# # Python Agent Module # # Copyright (c) 1997 John Eikenberry # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU Library General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. # # The author can be reached by email at jae@ai.uga.edu # from bb import * import shelve SHELF_DIR = '/home/httpd/cgi-bin/spider/' #SHELF_DIR = './' class Agent: """ The Agent class is only to be used as a superclass for the individual agents. It hides all the internal pipeworks of the agent, simplifying the creating of agent objects. Subclasses of the Agent class don't have any public methods, they are only called directly (via __call__). And the menski agent should handle all this (the initialization and calling of the agents). """ __type__ = 'AO' ## Pass in the messages when calling the agent (__call__). ## This would provide a simpler interface for making menski ## bots and normal bots indetical when spawned off from a menski ## bot. It would also simply things more in general. 
def __init__(self,bb): # Menski's BlackBoard self.bb = bb # dict of subgoals(agent name):state_with_subgoals_met(a string) self.req_subgoals = [] # List of states (strings) to act upon self.act_states = [] # Name of agent type self.name = self.__class__.__name__ # Agent id # (individual name) self.id = id(self) # Agents memory shelve_name = SHELF_DIR + '%s.shelf' % self.name self.memory = shelve.open(shelve_name) #if hasattr(self,'set_goals'): self.set_goals() # def __call__(self): """ Check BB for new goals and agent communication. If there is a match, act on it. Action: (apply call) Polymorphic call for each agent. It just looks up that agent's real method and calls it. This is so that a complex agent could inherit from multiple, more primitive, agents. """ # info will be dict of info for app (like func args) self.info = self._match_test() if self.info: apply(getattr(self,self.name)) # def _match_test(self): """ Test to see if this agent can do something with current info from BB """ bb = self.bb # check for request/query # check for pre_reqs # if pre_reqs, do action (& unrequest pre_reqs, eg. 
disconnect telnet) # in no request, check for state match # if state match, do action message = bb.message_test(self) if message: if message.has_key('answer'): return message if self.req_subgoals: subgoals = self._subgoals_not_met(message) if subgoals: for subgoal in subgoals: self.send_request(subgoal) return None else: for subgoal in self.req_subgoals: del self.memory[subgoal] else: return message else: # if no state match, returns None return bb.state_test(self.act_states) # receives -> required posts [optional posts] # states -> [states or requests or informs] # requests -> answers [and states or informs] # answers -> [states or requests or informs] def _subgoals_not_met(self,message): subgoals = [] prereq ={} for subgoal in self.req_subgoals: #if subgoal in self.memory.keys(): continue #self.memory[subgoal] = 1 prereq = { 'to_name':subgoal, 'from_id':self.id, 'from_name':self.name } # in case info for this request is needed for subgoal message.update(prereq) subgoals.append(request) # return if __debug__: print "\tSubgoal: ", subgoals return subgoals # Set goals... 
# This should be overriden def set_goals(self): pass # Post an inform def send_inform(self,target_name=None,msg_dict={}): """ Send an informative message to target Usage: self.send_inform(target_name,message_content) """ message = { 'inform':None, 'from_id':self.id, 'from_name':self.name, 'to_name':target_name } # informs are sent to names, only names of agents # on the nodes up the tree (not down another branch) message.update(msg_dict) if not message['to_name']: raise 'Message Sending error','no target given' self.bb.post([message]) # Post a request def send_request(self,target_name=None,msg_dict={}): """ Send a request to target Usage: self.send_request(target_name,message_content) """ message = { 'request':None, 'from_id':self.id, 'from_name':self.name, 'to_name':target_name } # requests are sent to names, only names of agents # on the nodes up the tree (not down another branch) message.update(msg_dict) if not message['to_name']: raise 'Message Sending error','no target given' self.bb.post([message]) # Post an answer to a request def send_answer(self,msg_dict): """ Send an answer to target Usage: self.send_answer(target_name,message_content) """ post_format = { 'answer':None, 'from_id':self.id, 'from_name':self.name, 'to_name':self.info['from_name'], 'to_id':self.info['from_id'] } msg_dict.update(post_format) self.bb.post([msg_dict]) # Post state info def post_state(self,state_dict): """ Post a state to the BB Usage: self.post_state(info_dict) """ post_format = { 'state':None, 'from_id':self.id, 'from_name':self.name } state_dict.update(post_format) self.bb.post([state_dict]) # Post a reply, either answer or state, whichever is appropriate def post_reply(self,msg_dict): """ Post either a state or an answer to the BB. Post an answer if this is in reply to a request, post a state if it was triggered by a state. 
Usage: self.post_reply(message_content) """ if self.info.has_key('request'): self.send_answer(msg_dict) else: self.post_state(msg_dict) # Menski agent class Menski(Agent): __type__ = 'Menski' # Can't have additional initialization arguments here # Must be identical to AO or it won't be transparent (won't encapsulate) #def __init__(self,agent_classes=[]): def __init__(self,pbb=BlackBoard()): # Menski's BlackBoard self.bb = BlackBoard() #if not pbb: # pbb = self.bb self.pbb = pbb Agent.__init__(self,self.bb) self.ids = {self.id:self.name} self.names = {self.name:self.id} # Hack, until I get the menski agent shelf built to store this info self.archtypes = eval(self.name + '_agent_classes') # Hack until threading is implemented # This gets set in _send_msg, _accept_msg, and _handle_collective # gets reset in __call__, gets checked in loop self.active = 0 # Initialize BB self.collective = [] if self.archtypes: self._spawn() if __debug__: print self.name,"Ids:", self.ids print self.name,"Names:", self.names # def _spawn(self,additional_classes=[]): """ Create the collective (ie. the agents managed). """ # additional_classes is used to add more agents to Menski # after initialization if additional_classes: self.archtypes = additional_classes # initialize collective for agent in self.archtypes: if agent.__type__ == 'Menski': #print "(Threading stuff here)" self.collective.append(agent(self.bb)) for id in self.collective[-1].ids.keys(): self.ids[id] = self.collective[-1].name #self.ids[self.collective[-1].id] = self.collective[-1].name self.names[self.collective[-1].name] = self.collective[-1].id else: self.collective.append(agent(self.bb)) self.ids[self.collective[-1].id] = self.collective[-1].name self.names[self.collective[-1].name] = self.collective[-1].id # def __call__(self): """ Pass in black board, make local BB (keeping it separate from global BB). Check for matches, run collective (may be the same). 
""" self.active = 0 self._accept_msgs() self._handle_collective() self._send_msgs() # def _handle_collective(self): """ Run through collective. Every member gets tested at least once after each update to the BB. OLD..... After each agent tells the collective they are done by doing None (Active agents always report something). The control is then passed on the _done(). """ # Run each agent in the collective for agent in self.collective: if __debug__: print "--", agent.name, agent.id, "--" agent() # Part of non-threading Hack if agent.__type__ == "Menski": if agent.active: self.active = 1 if __debug__: self.bb.show(self.name) # Should the menski agent have the chance to react to a state # this would mean an entry in the menski agents act_states attribute # it would work similarly to the request, just triggering a subgoal def _accept_msgs(self): pbb = self.pbb post_list = [] post_list_rm = [] if pbb: msg_list = pbb.message_test(self) for message in msg_list: if (message.has_key('request') and message['to_name'] ==self.name): post_list_rm.append(message.copy()) post_list.append(self._proxy(message)) elif (message.has_key('inform') and message['to_name'] ==self.name): post_list_rm.append(message.copy()) post_list.append(self._proxy(message)) elif (message.has_key('answer') and message['to_id'] in self.ids.keys()): post_list_rm.append(message) post_list.append(message) if post_list: pbb.remove(post_list_rm) self.bb.post(post_list) # Non-Threading Hack self.active = 1 # def _send_msgs(self): pbb = self.pbb post_list = [] for message in self.bb: if (message.has_key('request') and not self.names.has_key(message['to_name'])): post_list.append(message) elif (message.has_key('inform') and not self.names.has_key(message['to_name'])): post_list.append(message) elif (message.has_key('answer') and not self.ids.has_key(message['to_id'])): message['from_name'] = self.name post_list.append(message) # Non-Threading Hack self.active = 1 if post_list: if __debug__: print self.name, 
"sends Message:", for post in post_list: print post.keys(), print post['to_name'] pbb.post(post_list) self.bb.remove(post_list) # def _proxy(self,message): for subgoal in self.req_subgoals: message['to_name'] = subgoal return message # Loop Hack for non-threaded version def loop(self): cnt = 0 while 1: self.__call__() if not self.active: break else: cnt = cnt + 1 if __debug__: print "Loop: %d\n" % cnt ############### ## Testing stuff ## class Hello_World(Menski): def set_goals(self): self.req_subgoals.append('say_hello') class Top(Menski): pass class say_hello(Agent): """ Basic testing agent. """ def set_goals(self): self.act_states.append('start') def say_hello(self): post = { 'info':'hi' } self.post_state(post) self.bb.remove([self.info]) print "***Hello***" class say_there(Agent): """ Basic testing agent. """ def set_goals(self): self.act_states.append('hi') def say_there(self): post = { 'info':'there' } self.post_state(post) self.bb.remove([self.info]) print "***There***" class say_world(Agent): """ Basic testing agent. """ def set_goals(self): self.act_states.append('there') def say_world(self): self.bb.remove([self.info]) print "***World***" if __name__ == "__main__": Hello_World_agent_classes = [say_hello,say_there,say_world] Top_agent_classes = [Hello_World] top = Top() top.send_request('Hello_World',{}) top() NetSerf/display.py100644 764 764 4656 6544252016 12233 0ustar jaejae# # Python Agent Module # # Copyright (c) 1997 John Eikenberry # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. 
# # You should have received a copy of the GNU Library General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. # # The author can be reached by email at jae@ai.uga.edu # from bot import Agent class display(Agent): def display(self): topic = self.info['topic'] titles = self.info['titles'] scores = self.info['scores'] # print "Content-type: text/html" # print # print "" # print "

" print "" print "" page_list = [] for url,score in scores.items(): page_list.append((score,url)) page_list.sort() page_list.reverse() for score,url in page_list: title = titles[url] if title == None: title = url if title == "Not Found": print "" % url continue if url == 'primer': print "" % url continue print "" % (url,title,score) print "" % (url,) print "
SiteScore
" print "%s
[%s]
" print " [Visited,Bad]" % (url,) print " [Not Visited]" % (url,) print " [Visited]" % (url,) print " [Save]" % (url,) print " [Watch]" % (url,) print " [Save & Watch]
" # print "" # print "
" NetSerf/extractor.py100644 764 764 14724 6544252016 12616 0ustar jaejae# # Python Agent Module # # Copyright (c) 1997 John Eikenberry # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU Library General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. # # The author can be reached by email at jae@ai.uga.edu # from bot import Agent from math import sqrt import string,re class extractor(Agent): """ Agent that extracts the keywords and relations from the html parsed information. 
""" def extractor(self): topic = self.info['topic'] url = self.info['url'] title = self.info['title'] prim_headings = self.info['prim_headings'] sec_headings = self.info['sec_headings'] meta_keywords = self.info['keywords'] page = self.info['page'] # FIX - existing keywords from the snet snet_keywords = {} # Keywords (for snet) keywords = {} keywords.update(snet_keywords) # Compile regular expressions for munging document's text # Split up document into blocks based on this (for near) splitpat = re.compile(r'<(?:p(?=>)|tr|ol|ul|dt).*?>',re.I) # Delete html markup (including comments for javascript) delpat = re.compile(r'|<.*?>|&.*?;| ',re.S) # Clear out excess spaces spcpat = re.compile(r'^\s+|\s\s+') # Clear out other unwanted characters (tab,newline,return,],[,{,},\,/,-) replpat = re.compile(r'[\t\n\r(?:\][)\-(?:\s[|\]\s)._&,!~\*\?;\'\":}{\\/]+') # Split into blocks htmlblocks = re.split(splitpat,page) # Working variables keyword_cnt = {} # Munge blocks, build list of blocks, and create keyword count, # near_set and next_set wordblock_list = [] for block in htmlblocks: block = delpat.sub('',block) block = replpat.sub(' ',block) block = spcpat.sub('',block) if block: wordlist = map(string.lower,string.split(block)) clean_wordlist = [] for word in wordlist: if len(word) < 4: continue if keyword_cnt.has_key(word): keyword_cnt[word] = keyword_cnt[word] + 1.0 else: keyword_cnt[word] = 1.0 clean_wordlist.append(word) wordblock_list.append(clean_wordlist) # Make sure headings and title are properly formatted if prim_headings: prim_headings = string.split(string.lower(replpat.sub(' ',(string.join(prim_headings))))) if sec_headings: sec_headings = string.split(string.lower(replpat.sub(' ',(string.join(sec_headings))))) if meta_keywords: mkw_list = string.split(replpat.sub(' ',(meta_keywords))) else: mkw_list = () if title: title_list = string.split(replpat.sub(' ',(string.lower(title)))) else: title_list = () # Working variables sum = high = var = tsum = 0.0 low = 1000 # 
Tallie occurances, finds sum, squred sum, low (min) and high (max) kw_cnt_vals = keyword_cnt.values() for val in kw_cnt_vals: sum = sum + val high = max(val,high) low = min(val,low) # mean & standard deviation of number of occurances # of the words in the document #mean = sum / float(len(keyword_cnt)) for val in kw_cnt_vals[:]: #tval = val - mean #var = var + (tval * tval) if val == low or val == high: kw_cnt_vals.remove(val) else: tsum = tsum + val # stddev = sqrt(var / float(len(keyword_cnt) - 1)) if not tsum or not len(kw_cnt_vals): trim_mean = 2.0 else: trim_mean = tsum / float(len(kw_cnt_vals)) stat = trim_mean # Determine scores of words for word,cnt in keyword_cnt.items(): if cnt > stat: keywords[word] = (cnt - stat) / (high - stat) # Beginning to think that this does not pay off ## elif cnt < stat: ## tmp_val = (stat - cnt) / (stat - low) ## if tmp_val > 0.5 and tmp_val < 1.0: ## keywords[word] = 0.3 ## else: ## keywords[word] = 0.1 else: keywords[word] = 0.1 if len(word) > 6: keywords[word] = sqrt(keywords[word]) if word in sec_headings: keywords[word] = sqrt(keywords[word]) if word in prim_headings: keywords[word] = 1.0 if word in title_list: keywords[word] = 1.0 if word in mkw_list: keywords[word] = 1.0 for word,val in keywords.items(): # if not snet_keywords.has_key(word): if val < 0.54: del keywords[word] # +"tour de france" +98 near_list = [] near_d = {} next_d = {} for wordlist in wordblock_list: prev_word = '' for word in wordlist: if keywords.has_key(word): near_list.append(word) if prev_word: t_set = (prev_word,word) if not next_d.has_key(t_set): next_d[t_set] = None prev_word = word else: prev_word = '' for word in near_list[:]: near_list.remove(word) for near_word in near_list: l_set = [word,near_word] l_set.sort() t_set = tuple(l_set) if not near_d.has_key(t_set): near_d[t_set] = None # Set of words next to other words (for snet) next_set = tuple(next_d.keys()) # Set of words near other words (for snet) near_set = tuple(near_d.keys()) ## kw_set 
= kjSet(keywords.keys()) ## for set in near_set.items(): ## if len(set) != 2: ## del near_set[set] ## elif not set.subset(kw_set): ## del near_set[set] ## for set in next_set.items(): ## if len(set) != 2: ## del next_set[set] ## elif not set.subset(kw_set): ## del next_set[set] connectors = { 'next_to':next_set, 'near':near_set } # TODO # Figure in number of times keyword appears in other snets message = { 'topic':topic, 'url':url, 'title':title, 'keywords':keywords, 'connectors':connectors } self.send_inform('Profiler',message) ## print keywords ## print "****************************************************" ## print message ## print "keywords:",len(keywords) ## print "****************************************************" ## print "near_set:",len(near_set) ## print "****************************************************" ## print "next_set:",len(next_set),next_set ## print "****************************************************" ## print "stddev:",stddev,"trim_mean:",trim_mean,"min:",low,"max:",high NetSerf/fetcher.py100644 764 764 2460 6544252016 12175 0ustar jaejae# # Python Agent Module # # Copyright (c) 1997 John Eikenberry # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU Library General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. 
# # The author can be reached by email at jae@ai.uga.edu # from bot import * import socket def Retriever(bb): asyncore = {} execfile('lib/asyncore.py',asyncore) class Retriever (Agent, asyncore.dispatcher_with_send): def __init__(self,bb): # init dispatcher asyncore.dispatcher_with_send.__init__ (self) # get socket self.create_socket(socket.AF_INET, socket.SOCK_STREAM) # init Agent Agent.__init__(self,bb) def __call__(self): self.connect((host,80)) while asyncore.socket_map: asyncore.poll() NetSerf/html_parser.py100644 764 764 13654 6544252016 13124 0ustar jaejae# # Python Agent Module # # Copyright (c) 1997 John Eikenberry # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU Library General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. # # The author can be reached by email at jae@ai.uga.edu # import string import htmllib from bot import Agent from urlparse import * from formatter import NullFormatter ############################################################################ # myHTMLParser class is derived from htmllib.HTMLParse but to handle images # authors, etc. 
class MyHTMLParser(htmllib.HTMLParser):
    """HTML parser that also records images, frames, headings and meta data.

    After ``feed()`` the following attributes are available in addition
    to the ones maintained by ``htmllib.HTMLParser``:

    imagelist -- ``src`` of every image seen (no duplicates)
    pheadings -- words found in H1/H2 headings
    sheadings -- words found in H3/H4 headings
    keywords  -- lower-cased content of the META "keywords" tag ('' if none)
    author    -- content of the META "author" tag (None if none)
    base      -- ``href`` of the BASE tag (None if none)
    """

    def __init__(self,formatter):
        self.imagelist = []
        self.pheadings = []
        self.sheadings = []
        self.anchor_names = []
        self.keywords = ''
        self.title = ''
        self.author = None
        self.base = None
        # NOTE(review): the base class initialiser runs last, as in the
        # original; htmllib appears to reset ``title`` and ``base`` itself,
        # so the two assignments above may be overwritten -- confirm.
        htmllib.HTMLParser.__init__(self,formatter)

    # override handle_image()
    def handle_image(self,src,alt,*stuff):
        """Remember each distinct image source."""
        if src not in self.imagelist:
            self.imagelist.append(src)

    def do_frame(self,attrs):
        """Treat the source of a FRAME as a link."""
        for name, val in attrs:
            if name=="src":
                self.anchorlist.append(val)

    def save_bgn(self):
        """Start capturing character data into ``self.savedata``."""
        self.savedata = ''

    def save_end(self):
        """Stop capturing and return the raw captured text."""
        data = self.savedata
        self.savedata = None
        return data

    def _end_heading(self, headings):
        """Finish a heading capture and add its words to *headings*.

        Capture mode is always switched off here.  The original handlers
        returned early on an empty heading and left ``savedata`` set to
        '', so the text following the heading kept being captured.
        """
        if self.savedata is None:
            # End tag without a matching start tag: nothing was captured.
            return
        text = self.save_end()
        if text:
            headings.extend(text.split())

    # H1 and H2 are the primary headings, H3 and H4 the secondary ones.
    def start_h1(self,attrs):
        self.save_bgn()

    def end_h1(self):
        self._end_heading(self.pheadings)

    def start_h2(self,attrs):
        self.save_bgn()

    def end_h2(self):
        self._end_heading(self.pheadings)

    def start_h3(self,attrs):
        self.save_bgn()

    def end_h3(self):
        self._end_heading(self.sheadings)

    def start_h4(self,attrs):
        self.save_bgn()

    def end_h4(self):
        self._end_heading(self.sheadings)

    def do_meta(self,attrs):
        """Pick up the "keywords" and "author" META tags."""
        fields = {}
        for name, value in attrs:
            fields[name] = value
        if 'name' not in fields or 'content' not in fields:
            return
        meta_name = fields['name'].lower()
        if meta_name == 'keywords':
            self.keywords = fields['content'].lower()
        elif meta_name == 'author':
            self.author = fields['content']

    # for client-side image maps
    def do_area(self,attrs):
        """Treat the target of an image-map AREA as a link (no duplicates)."""
        for name, val in attrs:
            if name=="href":
                if val not in self.anchorlist:
                    self.anchorlist.append(val)

    def do_base(self,attrs):
        """Remember the document's BASE href."""
        for name,val in attrs:
            if name=="href":
                self.base = val
class html_parser(Agent):
    """Agent that takes a fetched page apart.

    The page is run through MyHTMLParser and the links, images, author,
    title, headings and keywords found in it are passed on in a single
    message.
    """

    def html_parser(self):
        """Parse ``self.info['page']`` and inform the 'extractor' agent.

        Relative urls are expanded against the page's BASE href when the
        page has one, otherwise against the url of the page itself.
        """
        url = self.info['url']        # full url of the document
        page = self.info['page']      # full text of the document
        topic = self.info['topic']    # the user's target topic

        parser = MyHTMLParser(NullFormatter())
        parser.feed(page)

        # Parent url against which relative paths are resolved.
        if parser.base is None:
            parent = url
        else:
            parent = parser.base

        # Links on the page, normalised, first occurrence only.
        urllist = []
        for raw_anchor in parser.anchorlist:
            link = urlformat(raw_anchor,parent)
            if link in urllist:
                continue
            urllist.append(link)

        # Images on the page, normalised, first occurrence only.
        imagelist = []
        for raw_image in parser.imagelist:
            picture = urlformat(raw_image,parent)
            if picture in imagelist:
                continue
            imagelist.append(picture)

        # The reply carries everything later agents need:
        #   url/title/author  -> facts about the page
        #   prim_headings     -> words from H1/H2 headings (list)
        #   sec_headings      -> words from H3/H4 headings (list)
        #   keywords          -> content of the META keywords tag
        #   urllist/imagelist -> urls of links and images on the page (lists)
        #   page              -> the page text itself
        message = {
            'topic':topic,
            'url':url,
            'title':parser.title,
            'author':parser.author,
            'prim_headings':parser.pheadings,
            'sec_headings':parser.sheadings,
            'keywords':parser.keywords,
            'urllist':urllist,
            'imagelist':imagelist,
            'page':page }
        self.send_inform('extractor',message)
def urlencode_search_data(Dict):
    """Encode a dictionary as a url encoded search query string.

    Dict maps name --> value.  A scalar value is quoted as it stands.
    A list (or tuple) value holds several search terms for one name:
    each term is quoted on its own, so a literal '+' becomes %2B, and
    the terms are then joined with '+', which search engines read as
    a space.

    Returns the "name=value" pairs joined with '&' ('' for an empty Dict).
    """
    try:
        from urllib import quote, quote_plus          # Python 2
    except ImportError:
        from urllib.parse import quote, quote_plus    # Python 3

    pairs = []
    for (name, values) in Dict.items():
        ename = quote(str(name))
        # isinstance() also accepts tuples and list subclasses; the old
        # exact-type test sent those down the scalar branch, which
        # encoded their repr().
        if isinstance(values, (list, tuple)):
            terms = [quote(str(value)) for value in values]
            # '%2b' is the safe set: '%' must survive so that the escapes
            # produced just above are not quoted a second time.
            evalue = quote_plus(" ".join(terms), '%2b')
        else:
            evalue = quote(str(values))
        pairs.append("%s=%s" % (ename, evalue))
    return "&".join(pairs)
""" if machine is None: machine = "www.webcrawler.com" if uri is None: uri = "/cgi-bin/WebQuery" if Dict is None: Dict = { 'searchText':'+python -monty', 'showSummary':'true', 'start':'0', 'perPage':'100' } print machine, uri, Dict request1 = GETSearch(machine, uri, Dict, port) Looper = Read_Event_Loop() Looper.verbose = 1 Looper.start_item(request1) Looper.go_til_done() if __name__=="__main__": test() NetSerf/reporter.py100644 764 764 7205 6544252016 12421 0ustar jaejae# # Python Agent Module # # Copyright (c) 1997 John Eikenberry # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU Library General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. # # The author can be reached by email at jae@ai.uga.edu # from bot import Agent,SHELF_DIR from snet import Snet,NET_SHELF from whrandom import random import shelve class reporter(Agent): """ Receives info from url-fether (urllist), display-agent (scores) or snet-builder (new-snets). Processes this info into results to display to user, and new scores to update snet with. 
""" def reporter(self): # Needs to be initialized for test below new_scores = {} # Get message info from either the url-fetcher (if), # display-agent (elif), or the snet-builder (else) topic = self.info['topic'] if self.memory.has_key(topic): mem = self.memory[topic] else: mem = {} if self.info.has_key('urllist'): urllist = self.info['urllist'] mem['urllist'] = urllist elif self.info.has_key('scores'): new_scores = self.info['scores'] # dict else: url = self.info['url'] title = self.info['title'] net = self.info['snet'] mem[url] = (title,net) if mem.has_key('urllist'): urllist = mem['urllist'] else: self.memory[topic] = mem return # Check to make sure all urls have been parsed before # getting user feedback on them ## print 'reporter-urllist:',urllist,'
' for url in urllist: # Not needed any more since the urllist is cleaned before sent if mem.has_key(url): ## print 'reporter-GOTIT-url:',url,'

' if not mem[url]: mem['urllist'].remove(url) del mem[url] else: ## print 'reporter-nope-url:',url,'

' ## self.bb.show(self.name) self.memory[topic] = mem return self.memory[topic] = mem # Not needed from this point on del mem['urllist'] # Score pages to determine which to show user # Get shelf of nets net_shelf = shelve.open(NET_SHELF) if net_shelf.has_key(topic): net = net_shelf[topic] else: net = '' # Temporary hack until user feedback interface is done # basically here so I can test the rest of the agent #for url in urllist: # new_scores[url] = random() # Store scores and nets for message to display-agent titles = {} if not new_scores: scores = {} for url,(title,mem_net) in mem.items(): #if url == 'urllist': continue if net: score = net.analyze(mem_net) # if score > 0.1: scores[url] = score titles[url] = title else: scores[url] = 'Not scored' titles[url] = title # Message to display agent message = { 'topic':topic, 'titles':titles, 'scores':scores } addressee = 'display' else: nets = [] for url,(title,mem_net) in mem.items(): if not new_scores.has_key(url): continue if new_scores[url] == '0.0': continue nets.append((new_scores[url],mem_net)) # Send info to snet-maintainer to update the snet message = { 'topic':topic, 'nets':nets } addressee = 'snet_maintainer' del self.memory[topic] self.send_inform(addressee,message) NetSerf/requests.py100644 764 764 16436 6544252016 12460 0ustar jaejae """Tools for making http requests.""" HTTPPORT = 80 BLOCKSIZE = 8000 CRLF = "\r\n" REQUEST_TOP = ( "%(METHOD)s %(URI)s HTTP/1.0\r\n" "User-Agent: %(AGENT)s\r\n" "Accept: */*\r\n" ) CT_REQUEST_TOP = REQUEST_TOP + ( "Content-type: %(CONTENT_TYPE)s\r\n" "Content-length: %(CONTENT_LENGTH)s\r\n" ) REQUEST_END = ( "\r\n" \ "%(BODY)s" ) SIMPLE_REQUEST = REQUEST_TOP + REQUEST_END CT_REQUEST = CT_REQUEST_TOP + REQUEST_END import string def Simple_Request(uri = "/index.html", agent = "test_robot"): Dict = {"METHOD":"GET", "URI":uri, "AGENT":agent, "BODY": ""} return SIMPLE_REQUEST % Dict def urlencode_data(Dict): """Encode dictionary as an url encoded query string. 
def Multi_Post_Query(url, Dict):
    """Format a POST request with a multipart/form-data body.

    url  -- request uri
    Dict -- name --> value, where value is a list only when the name
            should have multiple values

    Every value becomes one body part introduced by "--boundary".  The
    body is closed with the "--boundary--" delimiter; the original
    computed that delimiter but never emitted it, which left the body
    unterminated.

    Returns the complete request text built by ANY_Post_Query.
    """
    from mimetools import choose_boundary
    from urllib import quote

    boundary = choose_boundary()
    middle_boundary = "--" + boundary
    end_boundary = middle_boundary + "--"

    body = []
    for (name, values) in Dict.items():
        ename = quote(str(name))
        if not isinstance(values, list):
            values = [values]
        for value in values:
            body.append(middle_boundary)
            body.append(SEGMENT % (ename, value))
    body.append(end_boundary)
    body.append("")     # the closing delimiter ends with its own CRLF
    body = CRLF.join(body)

    content_type = "multipart/form-data; boundary=%s" % boundary
    return ANY_Post_Query(url, body, content_type)
class GETRequest:
    """One HTTP GET request against ``machine:port``.

    Usable either blocking, through retrieve(), or from a non-blocking
    read loop through start(), try_read(), when_done_reading() and
    selectable().  Subclasses override Request_Format() to change the
    text that is sent.
    """

    blocksize = BLOCKSIZE # overrideable for testing purposes

    def __init__(self, machine, uri, queryDict, port=HTTPPORT):
        self.machine = machine
        self.uri = uri
        self.Dict = queryDict
        self.port = port
        self.sock = None

    def __repr__(self):
        return self.__class__.__name__ + \
               repr((self.machine, self.uri, self.Dict, self.port))

    def Request_Format(self):
        """Return the full request text to send."""
        return Get_Query(self.uri, self.Dict)

    def connect(self):
        """Open a TCP connection, remember it in ``self.sock``, return it."""
        from socket import socket, AF_INET, SOCK_STREAM
        sock = self.sock = socket(AF_INET, SOCK_STREAM)
        # The address is a single (host, port) pair; the two-argument
        # form of connect() is not accepted by current Pythons.
        sock.connect((self.machine, self.port))
        return sock

    def send_request(self, sock):
        """Send the formatted request over *sock*."""
        # sendall() keeps writing until everything is out; send() may
        # stop after part of a large request.
        sock.sendall(self.Request_Format())

    def get_reply(self, sock):
        """Read until the peer closes; return the reply and close *sock*."""
        reply = []
        try:
            while 1:
                data = sock.recv(self.blocksize)
                if not data:
                    break
                reply.append(data)
        finally:
            # Close even when recv() fails so the socket is not leaked.
            sock.close()
        return "".join(reply)

    def retrieve(self):
        """Blocking convenience: connect, send, return the whole reply."""
        sock = self.connect()
        self.send_request(sock)
        return self.get_reply(sock)

    def start(self):
        """For use with a non-blocking read event loop."""
        self.sock = sock = self.connect()
        self.send_request(sock)
        self.data_read = []

    def try_read(self):
        """For use with a non-blocking read event loop.
        returns true when data is done."""
        from select import select
        # make sure self is readable (zero timeout: poll only)
        (test, dummy1, dummy2) = select([self.sock], [], [], 0)
        for sock in test:
            data = sock.recv(self.blocksize)
            if not data:
                return 1
            self.data_read.append(data)
        return 0

    def when_done_reading(self):
        """For use with a nonblocking read event loop.
        default implementation just prints the read data.
        If this is used, it will usually need to be overridden.
        """
        data = "".join(self.data_read)
        print(data)

    def selectable(self):
        """the selectable object in self."""
        return self.sock
class Read_Event_Loop:
    """Minimal select() loop driving several "readable" objects.

    A readable provides start(), selectable(), try_read() (true once
    all data has been read) and when_done_reading().
    """

    verbose = 0 # override this to get verbose tracing...

    def __init__(self, timeout=1, *readables):
        self.timeout = timeout
        self.init_list = readables      # readables not yet started
        self.started = {}               # selectable --> readable

    def start_all(self):
        """Start every readable handed to the constructor (once only)."""
        init_list = self.init_list
        self.init_list = ()
        for readable in init_list:
            self.start_item(readable)

    def start_item(self, readable):
        """Start *readable* and begin watching its selectable."""
        if self.verbose:
            print("starting %s" % (readable,))
        readable.start()
        self.started[ readable.selectable() ] = readable

    def try_all_readables(self):
        """Wait up to ``timeout`` and service every readable with data."""
        from select import select
        started = self.started
        # select() needs a real sequence, not a dictionary view.
        selectables = list(started.keys())
        if self.verbose:
            print("selecting %s" % len(selectables))
        (readavails, dummy1, dummy2) = \
                     select(selectables, [], [], self.timeout)
        for readavail in readavails:
            readable = started[readavail]
            if self.verbose:
                print("reading %s" % (readable,))
            test = readable.try_read()
            if test:
                if self.verbose:
                    print("terminating %s" % (readable,))
                readable.when_done_reading()
                del started[readavail]

    def all_done(self):
        """True when no started readable is still being watched."""
        return len(self.started) == 0

    def go_til_done(self):
        """Run until every readable has finished.

        Readables passed to the constructor are started first; before,
        they were silently ignored unless start_all() had been called.
        """
        self.start_all()
        while not self.all_done():
            self.try_all_readables()
""" if machine is None: machine = "www.att.com" if uri is None: uri = "/cgi-bin/ATT_WEB/search" if Dict is None: Dict = {"all":"all", "keywords": "Internet services", "weight": "weight", "limit": 2} print machine, uri, Dict request1 = GETRequest(machine, uri, Dict, port) request2 = POSTRequest(machine, uri, Dict, port) request3 = Multi_Post_Request(machine, uri, Dict, port) Looper = Read_Event_Loop() Looper.verbose = 1 Looper.start_item(request1) Looper.start_item(request2) Looper.start_item(request3) Looper.go_til_done() if __name__=="__main__": test() NetSerf/snet.py100644 764 764 20347 6544252016 11552 0ustar jaejae# # Python Agents - Semantic Keyword Network Module # # Copyright (c) 1997 John Eikenberry # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU Library General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. # # The author can be reached by email at jae@ai.uga.edu # from math import * from bot import SHELF_DIR import shelve,string NET_SHELF = SHELF_DIR + 'snet.shelf' FREQ_SHELF = SHELF_DIR + 'snet_frequency.shelf' class Snet: """ Semantic keyword network. Responsible for both building the networks and using it to anyalyze some text. """ def __init__(self): """ Semantic keyword network. 
""" self.keywords = {} self.relations = {'next_to':{},'near':{}} self.usable = 0 self.age = 0.0 self.epic = 30.0 def __nonzero__(self): return self.usable def analyze(self,snet): """ Takes a new snet representing a url, returns score for that url. """ rel_score = {} numof_rscores = {} kw_score = 0 numof_kscores = 0 score_list = [] # To figure in how frequent a word/set is used by counting # the number of other snets it occurs in freq_shelf = shelve.open(FREQ_SHELF) kw_freq = freq_shelf['keywords'] set_freq = freq_shelf['sets'] # Get total score of all relations for type, relation in snet.relations.items(): # intersection using tuples current_rels = self.relations[type] #test_rels = tuple(relation.keys()) # Replaced kjSets (switch back?) common_rels = [] for new_rel in relation.keys(): if current_rels.has_key(new_rel): common_rels.append(new_rel) numof_rscores[type] = len(common_rels) rel_score[type] = 0 for set in common_rels: rel_score[type] = rel_score[type] + \ (self.relations[type][set] / set_freq[type][set]) # Get total score of all keywords for keyword in snet.keywords.keys(): if self.keywords.has_key(keyword): kw_score = kw_score + (self.keywords[keyword] / kw_freq[keyword]) numof_kscores = numof_kscores + 1 # Store keyword frequency shelve freq_shelf[FREQ_SHELF] = kw_freq freq_shelf.close() # Average/Normalize scores for type in snet.relations.keys(): if rel_score[type] and numof_rscores[type]: score_list.append(rel_score[type] / numof_rscores[type]) if kw_score and numof_kscores: score_list.append(kw_score / numof_kscores) cum_score = reduce(lambda x,y: x+y, score_list, 0) if cum_score and score_list: score = cum_score / float(len(score_list)) else: score = 0 return score ## if score > 0: ## is_pos = 1 ## else: ## is_pos = 0 ## ## if is_pos: ## return score ## else: ## return 0.0 # def build(self,keywords={},connectors={'next_to':(),'near':()}): """ Takes keywords, connectors and user score, then builds a semantic network for them. 
def build(self,keywords=None,connectors=None):
    """Fill this net from the keywords and connectors of one page.

    keywords   -- {keyword: score, ...}; becomes ``self.keywords``
    connectors -- {'type': relations, ...} where relations is a sequence
                  of hashable keyword sets; each one is stored under
                  ``self.relations[type]`` with an initial score of 0

    Raises ValueError for a connector type this net does not know.
    The frequency shelf is given its empty tables on first use.
    """
    # None instead of a literal dict as default: the default dictionary
    # used to become ``self.keywords`` of every net built without
    # arguments, so those nets all shared (and mutated) one object.
    if keywords is None:
        keywords = {}
    if connectors is None:
        connectors = {'next_to':(),'near':()}

    # Make sure the frequency tables exist.
    freq_shelf = shelve.open(FREQ_SHELF)
    try:
        if not freq_shelf.has_key('keywords'):
            freq_shelf['keywords'] = {}
            freq_shelf['sets'] = {'next_to':{},'near':{}}
    finally:
        freq_shelf.close()

    # Attempt at using count scores to weigh kw scores
    self.keywords = keywords

    # Build new net of relations, every relation starting at score 0
    for rel_type, relations in connectors.items():
        if not self.relations.has_key(rel_type):
            # String exceptions are gone from the language; raise a
            # real exception carrying the original wording.
            raise ValueError(
                'Builder Error: type of connector not in network')
        scored = {}
        for rel in relations:
            scored[rel] = 0
        self.relations[rel_type] = scored
def get_keywords(self):
    """Return search terms derived from this net.

    Returns a tuple (top_kws, next_to, near):

    top_kws -- "+word " for the two highest positively scored keywords,
               then "-word " for the most negative keyword
    next_to -- the strongest 'next_to' relation, words joined by spaces,
               prefixed with '+' or '-' according to its sign
    near    -- the same for the 'near' relations

    With no usable scores the result is ('', '+', '+').
    """
    # Rank keywords from highest to lowest score.  The original if/elif
    # chain replaced the best word without demoting the previous best,
    # so the runner-up was usually lost.
    ranked = [(score, word) for word, score in self.keywords.items()]
    ranked.sort()
    ranked.reverse()
    positives = [word for score, word in ranked if score > 0.0]
    negatives = [word for score, word in ranked if score < 0.0]

    top_kws = ''
    for word in positives[:2]:
        top_kws = top_kws + "+%s " % word
    if negatives:
        # ranked is descending, so the last negative is the lowest.
        top_kws = top_kws + "-%s " % negatives[-1]
    elif len(positives) > 2:
        # In case there are no negative scores yet, the original excluded
        # the third-best positive word instead; that is kept as it was.
        # NOTE(review): confirm that this fallback is really wanted.
        top_kws = top_kws + "-%s " % positives[2]

    def strongest(relations):
        # Track the extreme scores as well as the sets.  The original
        # never updated hs/ls (and started ls at 1.0 for 'near'), so it
        # returned whichever set happened to be visited last.
        hs = ls = 0.0
        high_set = low_set = ()
        for rel, score in relations.items():
            if score > hs:
                hs = score
                high_set = rel
            elif score < ls:
                ls = score
                low_set = rel
        if abs(ls) > hs:
            return '-' + ' '.join(low_set)
        return '+' + ' '.join(high_set)

    next_to = strongest(self.relations['next_to'])
    near = strongest(self.relations['near'])
    return (top_kws,next_to,near)

*** keywords ***

" for word,score in self.keywords.items(): print "%s -> %f
" % (word,score) for type,dict in self.relations.items(): print "
*** " + type + " ***

" for set,score in self.relations[type].items(): print "\t%s --> %f
" % (set,score) print "

" if __debug__: def show(self): for word,score in self.keywords.items(): print "%s -> %f" % (word,score) for type,dict in self.relations.items(): print "*** " + type + " ***" for set,score in self.relations[type].items(): print "\t%s --> %f" % (set,score) NetSerf/snet_builder.py100644 764 764 2546 6544252016 13241 0ustar jaejae# # Python Agent Module # # Copyright (c) 1997 John Eikenberry # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU Library General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. # # The author can be reached by email at jae@ai.uga.edu # from bot import Agent from snet import Snet class snet_builder(Agent): """ Builds a new snet out of keywords and connectors as given by the extractor agent. 
""" def snet_builder(self): topic = self.info['topic'] url = self.info['url'] title = self.info['title'] keywords = self.info['keywords'] connectors = self.info['connectors'] net = Snet() net.build(keywords,connectors) message = { 'topic':topic, 'url':url, 'title':title, 'snet':net } self.send_inform('reporter',message) NetSerf/snet_maintainer.py100644 764 764 2770 6544252016 13741 0ustar jaejae# # Python Agent Module # # Copyright (c) 1997 John Eikenberry # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU Library General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. 
class snet_maintainer(Agent):
    """Agent that folds user-scored page nets into the topic's stored net."""

    def snet_maintainer(self):
        """Update the shelved net of ``self.info['topic']``.

        ``self.info['nets']`` is a list of (score, net) pairs.  The nets
        are applied to the topic's net, lowest score first, and the
        result is written back to the net shelf.
        """
        topic = self.info['topic']
        new_nets = self.info['nets']

        # Order by numeric score.  The scores reach this agent in the
        # form the user interface produced them (the reporter compares
        # them with the string '0.0'), so sorting the raw pairs ordered
        # them as text -- '-0.2' before '-0.9' -- and compared the net
        # objects themselves on equal scores.  The position is used as
        # tie breaker, which also keeps the sort stable.
        ordered = []
        for position, (score, new_net) in enumerate(new_nets):
            ordered.append((float(score), position, score, new_net))
        ordered.sort()

        # Get from shelve; the shelf is closed even if an update fails.
        net_shelf = shelve.open(NET_SHELF)
        try:
            if net_shelf.has_key(topic):
                net = net_shelf[topic]
            else:
                net = Snet()
            for numeric, position, score, new_net in ordered:
                # update() does its own conversion of the score.
                net.update(new_net,score)
            net_shelf[topic] = net
        finally:
            net_shelf.close()
def urlformat(url,parent=None):
    """Return a normalised version of *url*.

    - a url without a scheme is resolved against *parent*, if given
    - the fragment identifier ('#...') is removed
    - a trailing "index.*" document is dropped, leaving the directory
      ("http://host/a/index.html" becomes "http://host/a/")

    The query string is kept.
    """
    if parent is not None and urlparse(url)[0] == '':
        url = urljoin(parent,url)

    parts = list(urlparse(url))
    parts[5] = ''               # remove fragment

    # Only the last path segment may name the index document.  The
    # original cut the path at the last "index." found anywhere in it,
    # which turned "/docs/reindex.html" into "/docs/re".
    path = parts[2]
    cut = path.rfind('/') + 1
    if path[cut:].startswith('index.'):
        parts[2] = path[:cut]

    return urlunparse(tuple(parts))
""" def url_finder(self): topic = self.info['topic'] if self.info.has_key('uri'): uri = self.info['uri'] else: uri = '' if self.info.has_key('engine'): engine = self.info['engine'] elif self.info.has_key('machine'): engine = self.info['machine'] else: engine = 'Webcrawler' if self.info.has_key('keywords'): keywords = self.info['keywords'] else: keywords = '' net_shelf = shelve.open(NET_SHELF) if keywords: rm_pat = '-[\w]+|NOT\s+[\w]+|-".*?"|NOT\s+".*?"|AND|OR|NEAR|\+|"' msg_kws_list = string.split(re.sub(rm_pat,'',keywords)) msg_kws = {} for kw in msg_kws_list: msg_kws[kw] = 1.0 message = { 'topic':topic, 'url':'primer', 'title':None, 'keywords':msg_kws, 'connectors':{} } self.send_inform('Profiler',message) elif net_shelf.has_key(topic): net = net_shelf[topic] keywords = net.get_keywords() else: raise 'No keywords for query','' net_shelf.close() searchengines = ['www.infoseek.com','www.altavista.digital.com','www.webcrawler.com','google.stanford.edu'] if engine == 'Infoseek': port = 80 params = { 'col':'WW','qt':keywords } machine = 'www.infoseek.com' uri = '/Titles' # reassign engine for clearing unneeded urls below engine = 'infoseek|inxight' elif engine == 'Webcrawler': port = 80 params = { 'searchText':keywords, 'start':'0', 'perPage':'35' } ## params = { 'searchText':kw_list, ## 'showSummary':'true', ## 'start':'0', ## 'perPage':'25' } machine = 'www.webcrawler.com' uri = '/cgi-bin/WebQuery' # reassign engine for clearing unneeded urls below engine = 'webcrawler|excite' elif engine == 'Altavista': port = 80 params = { 'pg':'q','what':'web', 'kl':'XX','q':keywords } machine = 'www.altavista.digital.com' uri = '/cgi-bin/query' # reassign engine for clearing unneeded urls below engine = 'altavista|doubleclick|digital|thetrip' elif engine == "Google": port = 80 params = { 'num':'35', 'query':keywords } machine = 'google.stanford.edu' uri = '/search' # reassign engine for clearing unneeded urls below engine = 
'altavista|excite|findmail|google|hotbot|infoseek|lycos|dejanews|yahoo|amazon' else: machine = engine params = {} port = 80 #print params Get = GETSearch(machine,uri,params,port) #print machine,Get.Request_Format() page = Get.retrieve() #print page parser = HTMLParser(NullFormatter()) parser.feed(page) urls = {} # Set parent url, used to build up relative url paths if parser.base is not None: parent = parser.base else: parent = urlunparse(('http',machine,'/','','','')) # url's contained on page if machine in searchengines: pat = '\?|&|\+|%s|=|;|mailto:' % engine pat = re.compile(pat) else: pat = re.compile('\?|&|\+|=|;|mailto:') for anchor in parser.anchorlist: anchor=urlformat(anchor,parent) if pat.search(anchor): continue urls[anchor] = None message = { 'urls':urls, 'topic':topic } #print 'urls =', urls self.send_inform('url_hound',message) NetSerf/url_hound.py100644 764 764 5100 6544252016 12546 0ustar jaejae# # Python Agent Module # # Copyright (c) 1997 John Eikenberry # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU Library General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. 
#
# The author can be reached by email at jae@ai.uga.edu
#

import shelve
from bot import Agent,SHELF_DIR
from urllib import FancyURLopener

VURL_SHELF = SHELF_DIR + 'visited_urls.shelf'

# Only this many urls are fetched per run, to keep the web interface
# from timing out.
MAX_URLS_PER_RUN = 5


class MyURLopener(FancyURLopener):
    """ FancyURLopener that sends a browser User-agent header, never
    prompts for passwords and does not fetch ftp urls. """

    def __init__(self, proxies=None):
        FancyURLopener.__init__(self, proxies)
        self.addheaders = [('User-agent', 'Mozilla/3.0 (X11; I; Linux 2.0.33 i586)')]

    # Remove 401 error handling in favor of default handling,
    # because we don't want the plugin to ask for passwords.
    http_error_401 = FancyURLopener.http_error_default

    def open_ftp(self,url):
        """ Decline ftp urls: return 0 instead of a file-like object. """
        return 0


def _read_page(opener, url):
    """ Return the whole document at url as one string.

    Raises IOError when the opener fails or declines the url.
    """
    page_pnt = opener.open(url)
    if not page_pnt:
        # MyURLopener.open_ftp answers 0 rather than a file object;
        # report that as an ordinary fetch failure.
        raise IOError('unsupported url', url)
    try:
        return page_pnt.read()
    finally:
        page_pnt.close()


class url_hound(Agent):
    """
    Gets a web page.
    """

    def url_hound(self):
        """ Fetch not-yet-visited urls and pass their pages on.

        Reads from self.info:
            'urls'  -- dictionary whose keys are candidate urls
            'topic' -- topic the urls belong to

        Urls already recorded for the topic in the VURL_SHELF shelf are
        removed from 'urls'.  Of the rest, at most MAX_URLS_PER_RUN are
        fetched; each page is sent to 'Processor'.  Every attempted url,
        fetched or failed, is recorded as visited.  Finally the list of
        urls that were fetched is sent to 'Profiler'.
        """
        urls = self.info['urls']
        topic = self.info['topic']
        opener = MyURLopener()

        vurl_shelf = shelve.open(VURL_SHELF)
        try:
            if vurl_shelf.has_key(topic):
                vstd_urls = vurl_shelf[topic]
            else:
                vstd_urls = {}

            for url in urls.keys():
                if vstd_urls.has_key(url):
                    del urls[url]

            candidates = urls.keys()
            candidates.reverse()
            candidates = candidates[:MAX_URLS_PER_RUN]

            # Collect successes in a separate list: removing failures from
            # the list being iterated would skip the url after each failure.
            fetched = []
            for url in candidates:
                # Failed urls are recorded too, so they are not retried.
                vstd_urls[url] = None
                try:
                    page = _read_page(opener, url)
                    message = { 'topic':topic,
                                'url':url,
                                'page':page }
                    self.send_inform('Processor',message)
                except IOError:
                    continue
                fetched.append(url)

            message = { 'urllist':fetched, 'topic':topic }
            self.send_inform('Profiler',message)
            vurl_shelf[topic] = vstd_urls
        finally:
            # Close the shelf on every path, including errors.
            vurl_shelf.close()