NetSerf/ 40775 764 764 0 6544252204 10105 5 ustar jae jae NetSerf/serf.py 100775 764 764 16265 6544252005 11547 0 ustar jae jae #!/usr/bin/python -O # # Python Agent Module # # Copyright (c) 1997 John Eikenberry # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU Library General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. # # The author can be reached by email at jae@ai.uga.edu # import cgi,shelve,sys,urlparse path = '/home/httpd/cgi-bin/spider' sys.path.append(path) from snet import Snet,NET_SHELF IMP_URL_SHELF = path + '/imp_url.shelf' # def start_page(): url_shelf = shelve.open(IMP_URL_SHELF) if url_shelf: topics = url_shelf.keys() else: topics = () print "Content-type: text/html" print print "
" print "' NetSerf/bot.py 100644 764 764 27133 6544252016 11365 0 ustar jae jae # # Python Agent Module # # Copyright (c) 1997 John Eikenberry # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU Library General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. # # The author can be reached by email at jae@ai.uga.edu # from bb import * import shelve SHELF_DIR = '/home/httpd/cgi-bin/spider/' #SHELF_DIR = './' class Agent: """ The Agent class is only to be used as a superclass for the individual agents. It hides all the internal pipeworks of the agent, simplifying the creating of agent objects. Subclasses of the Agent class don't have any public methods, they are only called directly (via __call__). And the menski agent should handle all this (the initialization and calling of the agents). """ __type__ = 'AO' ## Pass in the messages when calling the agent (__call__). ## This would provide a simpler interface for making menski ## bots and normal bots indetical when spawned off from a menski ## bot. It would also simply things more in general. 
def __init__(self,bb): # Menski's BlackBoard self.bb = bb # dict of subgoals(agent name):state_with_subgoals_met(a string) self.req_subgoals = [] # List of states (strings) to act upon self.act_states = [] # Name of agent type self.name = self.__class__.__name__ # Agent id # (individual name) self.id = id(self) # Agents memory shelve_name = SHELF_DIR + '%s.shelf' % self.name self.memory = shelve.open(shelve_name) #if hasattr(self,'set_goals'): self.set_goals() # def __call__(self): """ Check BB for new goals and agent communication. If there is a match, act on it. Action: (apply call) Polymorphic call for each agent. It just looks up that agent's real method and calls it. This is so that a complex agent could inherit from multiple, more primitive, agents. """ # info will be dict of info for app (like func args) self.info = self._match_test() if self.info: apply(getattr(self,self.name)) # def _match_test(self): """ Test to see if this agent can do something with current info from BB """ bb = self.bb # check for request/query # check for pre_reqs # if pre_reqs, do action (& unrequest pre_reqs, eg. 
disconnect telnet) # in no request, check for state match # if state match, do action message = bb.message_test(self) if message: if message.has_key('answer'): return message if self.req_subgoals: subgoals = self._subgoals_not_met(message) if subgoals: for subgoal in subgoals: self.send_request(subgoal) return None else: for subgoal in self.req_subgoals: del self.memory[subgoal] else: return message else: # if no state match, returns None return bb.state_test(self.act_states) # receives -> required posts [optional posts] # states -> [states or requests or informs] # requests -> answers [and states or informs] # answers -> [states or requests or informs] def _subgoals_not_met(self,message): subgoals = [] prereq ={} for subgoal in self.req_subgoals: #if subgoal in self.memory.keys(): continue #self.memory[subgoal] = 1 prereq = { 'to_name':subgoal, 'from_id':self.id, 'from_name':self.name } # in case info for this request is needed for subgoal message.update(prereq) subgoals.append(request) # return if __debug__: print "\tSubgoal: ", subgoals return subgoals # Set goals... 
# This should be overriden def set_goals(self): pass # Post an inform def send_inform(self,target_name=None,msg_dict={}): """ Send an informative message to target Usage: self.send_inform(target_name,message_content) """ message = { 'inform':None, 'from_id':self.id, 'from_name':self.name, 'to_name':target_name } # informs are sent to names, only names of agents # on the nodes up the tree (not down another branch) message.update(msg_dict) if not message['to_name']: raise 'Message Sending error','no target given' self.bb.post([message]) # Post a request def send_request(self,target_name=None,msg_dict={}): """ Send a request to target Usage: self.send_request(target_name,message_content) """ message = { 'request':None, 'from_id':self.id, 'from_name':self.name, 'to_name':target_name } # requests are sent to names, only names of agents # on the nodes up the tree (not down another branch) message.update(msg_dict) if not message['to_name']: raise 'Message Sending error','no target given' self.bb.post([message]) # Post an answer to a request def send_answer(self,msg_dict): """ Send an answer to target Usage: self.send_answer(target_name,message_content) """ post_format = { 'answer':None, 'from_id':self.id, 'from_name':self.name, 'to_name':self.info['from_name'], 'to_id':self.info['from_id'] } msg_dict.update(post_format) self.bb.post([msg_dict]) # Post state info def post_state(self,state_dict): """ Post a state to the BB Usage: self.post_state(info_dict) """ post_format = { 'state':None, 'from_id':self.id, 'from_name':self.name } state_dict.update(post_format) self.bb.post([state_dict]) # Post a reply, either answer or state, whichever is appropriate def post_reply(self,msg_dict): """ Post either a state or an answer to the BB. Post an answer if this is in reply to a request, post a state if it was triggered by a state. 
Usage: self.post_reply(message_content) """ if self.info.has_key('request'): self.send_answer(msg_dict) else: self.post_state(msg_dict) # Menski agent class Menski(Agent): __type__ = 'Menski' # Can't have additional initialization arguments here # Must be identical to AO or it won't be transparent (won't encapsulate) #def __init__(self,agent_classes=[]): def __init__(self,pbb=BlackBoard()): # Menski's BlackBoard self.bb = BlackBoard() #if not pbb: # pbb = self.bb self.pbb = pbb Agent.__init__(self,self.bb) self.ids = {self.id:self.name} self.names = {self.name:self.id} # Hack, until I get the menski agent shelf built to store this info self.archtypes = eval(self.name + '_agent_classes') # Hack until threading is implemented # This gets set in _send_msg, _accept_msg, and _handle_collective # gets reset in __call__, gets checked in loop self.active = 0 # Initialize BB self.collective = [] if self.archtypes: self._spawn() if __debug__: print self.name,"Ids:", self.ids print self.name,"Names:", self.names # def _spawn(self,additional_classes=[]): """ Create the collective (ie. the agents managed). """ # additional_classes is used to add more agents to Menski # after initialization if additional_classes: self.archtypes = additional_classes # initialize collective for agent in self.archtypes: if agent.__type__ == 'Menski': #print "(Threading stuff here)" self.collective.append(agent(self.bb)) for id in self.collective[-1].ids.keys(): self.ids[id] = self.collective[-1].name #self.ids[self.collective[-1].id] = self.collective[-1].name self.names[self.collective[-1].name] = self.collective[-1].id else: self.collective.append(agent(self.bb)) self.ids[self.collective[-1].id] = self.collective[-1].name self.names[self.collective[-1].name] = self.collective[-1].id # def __call__(self): """ Pass in black board, make local BB (keeping it separate from global BB). Check for matches, run collective (may be the same). 
""" self.active = 0 self._accept_msgs() self._handle_collective() self._send_msgs() # def _handle_collective(self): """ Run through collective. Every member gets tested at least once after each update to the BB. OLD..... After each agent tells the collective they are done by doing None (Active agents always report something). The control is then passed on the _done(). """ # Run each agent in the collective for agent in self.collective: if __debug__: print "--", agent.name, agent.id, "--" agent() # Part of non-threading Hack if agent.__type__ == "Menski": if agent.active: self.active = 1 if __debug__: self.bb.show(self.name) # Should the menski agent have the chance to react to a state # this would mean an entry in the menski agents act_states attribute # it would work similarly to the request, just triggering a subgoal def _accept_msgs(self): pbb = self.pbb post_list = [] post_list_rm = [] if pbb: msg_list = pbb.message_test(self) for message in msg_list: if (message.has_key('request') and message['to_name'] ==self.name): post_list_rm.append(message.copy()) post_list.append(self._proxy(message)) elif (message.has_key('inform') and message['to_name'] ==self.name): post_list_rm.append(message.copy()) post_list.append(self._proxy(message)) elif (message.has_key('answer') and message['to_id'] in self.ids.keys()): post_list_rm.append(message) post_list.append(message) if post_list: pbb.remove(post_list_rm) self.bb.post(post_list) # Non-Threading Hack self.active = 1 # def _send_msgs(self): pbb = self.pbb post_list = [] for message in self.bb: if (message.has_key('request') and not self.names.has_key(message['to_name'])): post_list.append(message) elif (message.has_key('inform') and not self.names.has_key(message['to_name'])): post_list.append(message) elif (message.has_key('answer') and not self.ids.has_key(message['to_id'])): message['from_name'] = self.name post_list.append(message) # Non-Threading Hack self.active = 1 if post_list: if __debug__: print self.name, 
"sends Message:", for post in post_list: print post.keys(), print post['to_name'] pbb.post(post_list) self.bb.remove(post_list) # def _proxy(self,message): for subgoal in self.req_subgoals: message['to_name'] = subgoal return message # Loop Hack for non-threaded version def loop(self): cnt = 0 while 1: self.__call__() if not self.active: break else: cnt = cnt + 1 if __debug__: print "Loop: %d\n" % cnt ############### ## Testing stuff ## class Hello_World(Menski): def set_goals(self): self.req_subgoals.append('say_hello') class Top(Menski): pass class say_hello(Agent): """ Basic testing agent. """ def set_goals(self): self.act_states.append('start') def say_hello(self): post = { 'info':'hi' } self.post_state(post) self.bb.remove([self.info]) print "***Hello***" class say_there(Agent): """ Basic testing agent. """ def set_goals(self): self.act_states.append('hi') def say_there(self): post = { 'info':'there' } self.post_state(post) self.bb.remove([self.info]) print "***There***" class say_world(Agent): """ Basic testing agent. """ def set_goals(self): self.act_states.append('there') def say_world(self): self.bb.remove([self.info]) print "***World***" if __name__ == "__main__": Hello_World_agent_classes = [say_hello,say_there,say_world] Top_agent_classes = [Hello_World] top = Top() top.send_request('Hello_World',{}) top() NetSerf/display.py 100644 764 764 4656 6544252016 12233 0 ustar jae jae # # Python Agent Module # # Copyright (c) 1997 John Eikenberry # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. 
# # You should have received a copy of the GNU Library General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. # # The author can be reached by email at jae@ai.uga.edu # from bot import Agent class display(Agent): def display(self): topic = self.info['topic'] titles = self.info['titles'] scores = self.info['scores'] # print "Content-type: text/html" # print # print "
" # print "" NetSerf/extractor.py 100644 764 764 14724 6544252016 12616 0 ustar jae jae # # Python Agent Module # # Copyright (c) 1997 John Eikenberry # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU Library General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. # # The author can be reached by email at jae@ai.uga.edu # from bot import Agent from math import sqrt import string,re class extractor(Agent): """ Agent that extracts the keywords and relations from the html parsed information. 
""" def extractor(self): topic = self.info['topic'] url = self.info['url'] title = self.info['title'] prim_headings = self.info['prim_headings'] sec_headings = self.info['sec_headings'] meta_keywords = self.info['keywords'] page = self.info['page'] # FIX - existing keywords from the snet snet_keywords = {} # Keywords (for snet) keywords = {} keywords.update(snet_keywords) # Compile regular expressions for munging document's text # Split up document into blocks based on this (for near) splitpat = re.compile(r'<(?:p(?=>)|tr|ol|ul|dt).*?>',re.I) # Delete html markup (including comments for javascript) delpat = re.compile(r'|<.*?>|&.*?;| ',re.S) # Clear out excess spaces spcpat = re.compile(r'^\s+|\s\s+') # Clear out other unwanted characters (tab,newline,return,],[,{,},\,/,-) replpat = re.compile(r'[\t\n\r(?:\][)\-(?:\s[|\]\s)._&,!~\*\?;\'\":}{\\/]+') # Split into blocks htmlblocks = re.split(splitpat,page) # Working variables keyword_cnt = {} # Munge blocks, build list of blocks, and create keyword count, # near_set and next_set wordblock_list = [] for block in htmlblocks: block = delpat.sub('',block) block = replpat.sub(' ',block) block = spcpat.sub('',block) if block: wordlist = map(string.lower,string.split(block)) clean_wordlist = [] for word in wordlist: if len(word) < 4: continue if keyword_cnt.has_key(word): keyword_cnt[word] = keyword_cnt[word] + 1.0 else: keyword_cnt[word] = 1.0 clean_wordlist.append(word) wordblock_list.append(clean_wordlist) # Make sure headings and title are properly formatted if prim_headings: prim_headings = string.split(string.lower(replpat.sub(' ',(string.join(prim_headings))))) if sec_headings: sec_headings = string.split(string.lower(replpat.sub(' ',(string.join(sec_headings))))) if meta_keywords: mkw_list = string.split(replpat.sub(' ',(meta_keywords))) else: mkw_list = () if title: title_list = string.split(replpat.sub(' ',(string.lower(title)))) else: title_list = () # Working variables sum = high = var = tsum = 0.0 low = 1000 # 
Tallie occurances, finds sum, squred sum, low (min) and high (max) kw_cnt_vals = keyword_cnt.values() for val in kw_cnt_vals: sum = sum + val high = max(val,high) low = min(val,low) # mean & standard deviation of number of occurances # of the words in the document #mean = sum / float(len(keyword_cnt)) for val in kw_cnt_vals[:]: #tval = val - mean #var = var + (tval * tval) if val == low or val == high: kw_cnt_vals.remove(val) else: tsum = tsum + val # stddev = sqrt(var / float(len(keyword_cnt) - 1)) if not tsum or not len(kw_cnt_vals): trim_mean = 2.0 else: trim_mean = tsum / float(len(kw_cnt_vals)) stat = trim_mean # Determine scores of words for word,cnt in keyword_cnt.items(): if cnt > stat: keywords[word] = (cnt - stat) / (high - stat) # Beginning to think that this does not pay off ## elif cnt < stat: ## tmp_val = (stat - cnt) / (stat - low) ## if tmp_val > 0.5 and tmp_val < 1.0: ## keywords[word] = 0.3 ## else: ## keywords[word] = 0.1 else: keywords[word] = 0.1 if len(word) > 6: keywords[word] = sqrt(keywords[word]) if word in sec_headings: keywords[word] = sqrt(keywords[word]) if word in prim_headings: keywords[word] = 1.0 if word in title_list: keywords[word] = 1.0 if word in mkw_list: keywords[word] = 1.0 for word,val in keywords.items(): # if not snet_keywords.has_key(word): if val < 0.54: del keywords[word] # +"tour de france" +98 near_list = [] near_d = {} next_d = {} for wordlist in wordblock_list: prev_word = '' for word in wordlist: if keywords.has_key(word): near_list.append(word) if prev_word: t_set = (prev_word,word) if not next_d.has_key(t_set): next_d[t_set] = None prev_word = word else: prev_word = '' for word in near_list[:]: near_list.remove(word) for near_word in near_list: l_set = [word,near_word] l_set.sort() t_set = tuple(l_set) if not near_d.has_key(t_set): near_d[t_set] = None # Set of words next to other words (for snet) next_set = tuple(next_d.keys()) # Set of words near other words (for snet) near_set = tuple(near_d.keys()) ## kw_set 
= kjSet(keywords.keys()) ## for set in near_set.items(): ## if len(set) != 2: ## del near_set[set] ## elif not set.subset(kw_set): ## del near_set[set] ## for set in next_set.items(): ## if len(set) != 2: ## del next_set[set] ## elif not set.subset(kw_set): ## del next_set[set] connectors = { 'next_to':next_set, 'near':near_set } # TODO # Figure in number of times keyword appears in other snets message = { 'topic':topic, 'url':url, 'title':title, 'keywords':keywords, 'connectors':connectors } self.send_inform('Profiler',message) ## print keywords ## print "****************************************************" ## print message ## print "keywords:",len(keywords) ## print "****************************************************" ## print "near_set:",len(near_set) ## print "****************************************************" ## print "next_set:",len(next_set),next_set ## print "****************************************************" ## print "stddev:",stddev,"trim_mean:",trim_mean,"min:",low,"max:",high NetSerf/fetcher.py 100644 764 764 2460 6544252016 12175 0 ustar jae jae # # Python Agent Module # # Copyright (c) 1997 John Eikenberry # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU Library General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. 
# # The author can be reached by email at jae@ai.uga.edu # from bot import * import socket def Retriever(bb): asyncore = {} execfile('lib/asyncore.py',asyncore) class Retriever (Agent, asyncore.dispatcher_with_send): def __init__(self,bb): # init dispatcher asyncore.dispatcher_with_send.__init__ (self) # get socket self.create_socket(socket.AF_INET, socket.SOCK_STREAM) # init Agent Agent.__init__(self,bb) def __call__(self): self.connect((host,80)) while asyncore.socket_map: asyncore.poll() NetSerf/html_parser.py 100644 764 764 13654 6544252016 13124 0 ustar jae jae # # Python Agent Module # # Copyright (c) 1997 John Eikenberry # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU Library General Public # License along with this library; if not, write to the # Free Software Foundation, Inc., 59 Temple Place - Suite 330, # Boston, MA 02111-1307, USA. # # The author can be reached by email at jae@ai.uga.edu # import string import htmllib from bot import Agent from urlparse import * from formatter import NullFormatter ############################################################################ # myHTMLParser class is derived from htmllib.HTMLParse but to handle images # authors, etc. # Blatently stolen from linbot by Marduk with some modifications by me # to get keywords and headings # Marduk's home page'
self.send_inform('extractor',message)
NetSerf/myRequests.py 100644 764 764 4522 6544252016 12737 0 ustar jae jae
from requests import *
import string
# Request line plus the headers sent with every request.  The %(NAME)s
# fields are filled in from a dictionary when a template is formatted.
REQUEST_TOP = (
    "%(METHOD)s %(URI)s HTTP/1.0\r\n"
    "User-Agent: %(AGENT)s\r\n"
    "Accept: */*\r\n"
)

# The same header block with a Cookie header appended.
SE_REQUEST_TOP = REQUEST_TOP + "Cookie: %(COOKIE)s\r\n"

# Complete request templates.  REQUEST_END is not defined in this module;
# it is picked up from the wildcard import of `requests`.
SIMPLE_REQUEST = REQUEST_TOP + REQUEST_END
SEARCH_REQUEST = SE_REQUEST_TOP + REQUEST_END
def Search_Request(uri = "/index.html",
                   cookie = "",
                   agent = "Mozilla/3.0 (X11; I; Linux 2.0.33 i586)"):
    """Build the text of a GET request for `uri`.

    uri    -- request URI placed on the request line
    cookie -- value for the Cookie header; when it is empty (the default)
              the cookie-less SIMPLE_REQUEST template is used instead of
              SEARCH_REQUEST
    agent  -- value for the User-Agent header

    Returns the formatted request as a string.
    """
    fields = {}
    fields["METHOD"] = "GET"
    fields["URI"] = uri
    fields["AGENT"] = agent
    fields["COOKIE"] = cookie
    fields["BODY"] = ""

    # Guard clause: without a cookie there is no Cookie header to send.
    if not cookie:
        return SIMPLE_REQUEST % fields
    return SEARCH_REQUEST % fields
def Search_Query(url,Dict):
    """Return a GET request for `url` with `Dict` encoded as its query.

    Dict is encoded with urlencode_search_data(); when that yields an
    empty string the bare url is requested, without a trailing "?".
    The request text itself is produced by Search_Request().
    """
    encoded = urlencode_search_data(Dict)
    if not encoded:
        return Search_Request(url)
    return Search_Request("%s?%s" % (url, encoded))
def urlencode_search_data(Dict):
    """Encode dictionary as an url encoded query string.

    Dict should contain name-->value, where value is a list only when
    the name should have multiple values.

    A non-list value becomes one "name=value" pair with both sides
    percent-quoted.  A list value becomes a single "name=v1+v2+..." pair:
    every item is percent-quoted, the items are joined with spaces, and
    a second quoting pass turns those spaces into "+" separators.  A
    literal "+" inside a value is therefore sent as "%2B" and cannot be
    mistaken for a separator.

    Pairs are joined with "&" in the dictionary's iteration order.
    Returns the query string ("" for an empty dictionary).
    """
    # Imported locally to keep this helper self-contained; the fallback
    # covers interpreters where the quoting functions live in
    # urllib.parse rather than directly in urllib.
    try:
        from urllib import quote, quote_plus
    except ImportError:
        from urllib.parse import quote, quote_plus

    pairs = []
    for name, values in Dict.items():
        ename = quote(str(name))
        if isinstance(values, list):
            quoted = [quote(str(value)) for value in values]
            # '%' (with '2' and 'b') is passed as safe so the escapes
            # produced by the first pass are not quoted a second time.
            evalue = quote_plus(" ".join(quoted), '%2b')
        else:
            evalue = quote(str(values))
        pairs.append("%s=%s" % (ename, evalue))
    return "&".join(pairs)
class GETSearch(GETRequest):
    """
    GET request for use with search engines (Infoseek, Altavista),
    which require a somewhat more sophisticated method for formatting
    the queries: "+" characters occur in the query and have to be
    handled specially so they aren't interpreted as spaces.
    """

    def Request_Format(self):
        """Return the request text built from self.uri and self.Dict."""
        target = self.uri
        params = self.Dict
        return Search_Query(target, params)
def test(machine=None, uri=None, Dict=None, port=HTTPPORT):
"""
Poke a webcrawler using search request format.
"""
if machine is None:
machine = "www.webcrawler.com"
if uri is None:
uri = "/cgi-bin/WebQuery"
if Dict is None:
Dict = { 'searchText':'+python -monty',
'showSummary':'true',
'start':'0',
'perPage':'100' }
print machine, uri, Dict
request1 = GETSearch(machine, uri, Dict, port)
Looper = Read_Event_Loop()
Looper.verbose = 1
Looper.start_item(request1)
Looper.go_til_done()
# Run the search-request smoke test when this module is executed as a script.
if __name__=="__main__":
    test()
NetSerf/reporter.py 100644 764 764 7205 6544252016 12421 0 ustar jae jae #
# Python Agent Module
#
# Copyright (c) 1997 John Eikenberry
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU Library General Public
# License along with this library; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA 02111-1307, USA.
#
# The author can be reached by email at jae@ai.uga.edu
#
from bot import Agent,SHELF_DIR
from snet import Snet,NET_SHELF
from whrandom import random
import shelve
class reporter(Agent):
"""
Receives info from url-fether (urllist), display-agent
(scores) or snet-builder (new-snets). Processes this
info into results to display to user, and new scores
to update snet with.
"""
def reporter(self):
# Needs to be initialized for test below
new_scores = {}
# Get message info from either the url-fetcher (if),
# display-agent (elif), or the snet-builder (else)
topic = self.info['topic']
if self.memory.has_key(topic):
mem = self.memory[topic]
else:
mem = {}
if self.info.has_key('urllist'):
urllist = self.info['urllist']
mem['urllist'] = urllist
elif self.info.has_key('scores'):
new_scores = self.info['scores'] # dict
else:
url = self.info['url']
title = self.info['title']
net = self.info['snet']
mem[url] = (title,net)
if mem.has_key('urllist'):
urllist = mem['urllist']
else:
self.memory[topic] = mem
return
# Check to make sure all urls have been parsed before
# getting user feedback on them
## print 'reporter-urllist:',urllist,'
'
for url in urllist:
# Not needed any more since the urllist is cleaned before sent
if mem.has_key(url):
## print 'reporter-GOTIT-url:',url,'
' if not mem[url]: mem['urllist'].remove(url) del mem[url] else: ## print 'reporter-nope-url:',url,'
' ## self.bb.show(self.name) self.memory[topic] = mem return self.memory[topic] = mem # Not needed from this point on del mem['urllist'] # Score pages to determine which to show user # Get shelf of nets net_shelf = shelve.open(NET_SHELF) if net_shelf.has_key(topic): net = net_shelf[topic] else: net = '' # Temporary hack until user feedback interface is done # basically here so I can test the rest of the agent #for url in urllist: # new_scores[url] = random() # Store scores and nets for message to display-agent titles = {} if not new_scores: scores = {} for url,(title,mem_net) in mem.items(): #if url == 'urllist': continue if net: score = net.analyze(mem_net) # if score > 0.1: scores[url] = score titles[url] = title else: scores[url] = 'Not scored' titles[url] = title # Message to display agent message = { 'topic':topic, 'titles':titles, 'scores':scores } addressee = 'display' else: nets = [] for url,(title,mem_net) in mem.items(): if not new_scores.has_key(url): continue if new_scores[url] == '0.0': continue nets.append((new_scores[url],mem_net)) # Send info to snet-maintainer to update the snet message = { 'topic':topic, 'nets':nets } addressee = 'snet_maintainer' del self.memory[topic] self.send_inform(addressee,message) NetSerf/requests.py 100644 764 764 16436 6544252016 12460 0 ustar jae jae """Tools for making http requests.""" HTTPPORT = 80 BLOCKSIZE = 8000 CRLF = "\r\n" REQUEST_TOP = ( "%(METHOD)s %(URI)s HTTP/1.0\r\n" "User-Agent: %(AGENT)s\r\n" "Accept: */*\r\n" ) CT_REQUEST_TOP = REQUEST_TOP + ( "Content-type: %(CONTENT_TYPE)s\r\n" "Content-length: %(CONTENT_LENGTH)s\r\n" ) REQUEST_END = ( "\r\n" \ "%(BODY)s" ) SIMPLE_REQUEST = REQUEST_TOP + REQUEST_END CT_REQUEST = CT_REQUEST_TOP + REQUEST_END import string def Simple_Request(uri = "/index.html", agent = "test_robot"): Dict = {"METHOD":"GET", "URI":uri, "AGENT":agent, "BODY": ""} return SIMPLE_REQUEST % Dict def urlencode_data(Dict): """Encode dictionary as an url encoded query string. 
Dict should contain name-->value, where value is a list only when the name should have multiple values. """ from urllib import quote from types import ListType pairs = [] for (name, values) in Dict.items(): ename = quote(str(name)) if type(values) != ListType: values = [values] for value in values: evalue = quote(str(value)) pairs.append( "%s=%s" % (ename, evalue) ) return string.joinfields(pairs, "&") def Get_Query(url, Dict): query = urlencode_data(Dict) uri = "%s?%s" % (url, query) return Simple_Request(uri) def ANY_Post_Query(url, query, content_type): Sub = {} Sub["URI"] = url Sub["METHOD"] = "POST" Sub["AGENT"] = "the-dreaded-robot-poster" Sub["CONTENT_TYPE"] = content_type Sub["CONTENT_LENGTH"] = len(query) Sub["BODY"] = query return CT_REQUEST % Sub def Post_Query(url, Dict): query = urlencode_data(Dict) return ANY_Post_Query(url, query, "application/x-www-form-urlencoded") SEGMENT = ("Content-type: text/plain\r\n" 'Content-disposition: form-data; name="%s"\r\n' "\r\n" "%s") def Multi_Post_Query(url, Dict): from mimetools import choose_boundary from urllib import quote from types import ListType boundary = choose_boundary() middle_boundary = "--" + boundary end_boundary = middle_boundary+"--" body = [] for (name, values) in Dict.items(): ename = quote(str(name)) if type(values)!=ListType: values = [values] for value in values: body.append(middle_boundary) body.append(SEGMENT % (ename, value)) body = string.joinfields(body, CRLF) content_type = "multipart/form-data; boundary=%s" % boundary return ANY_Post_Query(url, body, content_type) class GETRequest: blocksize = BLOCKSIZE # overrideable for testing purposes def __init__(self, machine, uri, queryDict, port=HTTPPORT): self.machine = machine self.uri = uri self.Dict = queryDict self.port = port self.sock = None def __repr__(self): return self.__class__.__name__ + \ `(self.machine, self.uri, self.Dict, self.port)` def Request_Format(self): return Get_Query(self.uri, self.Dict) def connect(self): from socket 
import socket, AF_INET, SOCK_STREAM sock = self.sock = socket(AF_INET, SOCK_STREAM) sock.connect(self.machine, self.port) return sock def send_request(self, sock): request = self.Request_Format() sock.send(request) def get_reply(self, sock): reply = [] while 1: data = sock.recv(self.blocksize) if not data: break reply.append(data) sock.close() return string.joinfields(reply, "") def retrieve(self): sock = self.connect() self.send_request(sock) return self.get_reply(sock) def start(self): """For use with a non-blocking read event loop.""" self.sock = sock = self.connect() self.send_request(sock) self.data_read = [] def try_read(self): """For use with a non-blocking read event loop. returns true when data is done.""" from select import select # make sure self is readable (test, dummy1, dummy2) = select([self.sock], [], [], 0) for sock in test: data = sock.recv(self.blocksize) if not data: return 1 self.data_read.append(data) return 0 def when_done_reading(self): """For use with a nonblocking read event loop. default implementation just prints the read data. If this is used, it will usually need to be overridden. """ data = string.joinfields(self.data_read, "") print data def selectable(self): """the selectable object in self.""" return self.sock class POSTRequest(GETRequest): def Request_Format(self): return Post_Query(self.uri, self.Dict) class Multi_Post_Request(GETRequest): def Request_Format(self): return Multi_Post_Query(self.uri, self.Dict) class Any_Request(GETRequest, POSTRequest, Multi_Post_Request): def __init__(self, behaviour, machine, uri, queryDict, port=HTTPPORT): """behaviour should be one of the superclasses.""" self.behaviour = behaviour GETRequest.__init__(self, machine, uri, queryDict, port) def Request_Format(self): return self.behaviour.Request_Format(self) class Read_Event_Loop: verbose = 0 # override this to get verbose tracing... 
class Read_Event_Loop:
    """Drive a group of readables until each one reports it is finished.

    A readable provides start(), selectable(), try_read() and
    when_done_reading().
    """

    verbose = 0  # override this to get verbose tracing...

    def __init__(self, timeout=1, *readables):
        """timeout is handed to select(); readables wait for start_all()."""
        self.timeout = timeout
        self.init_list = readables
        self.started = {}

    def _trace(self, *parts):
        """Print parts separated by blanks, but only when verbose is set."""
        if self.verbose:
            print(" ".join(map(str, parts)))

    def start_all(self):
        """Start every readable given to the constructor, once."""
        pending, self.init_list = self.init_list, ()
        for readable in pending:
            self.start_item(readable)

    def start_item(self, readable):
        """Start one readable and file it under its selectable object."""
        self._trace("starting", readable)
        readable.start()
        key = readable.selectable()
        self.started[key] = readable

    def try_all_readables(self):
        """Wait up to timeout for input, then service whatever is ready.

        A readable whose try_read() returns true gets its
        when_done_reading() called and is dropped from the loop.
        """
        from select import select
        started = self.started
        selectables = list(started)
        self._trace("selecting", len(selectables))
        ready = select(selectables, [], [], self.timeout)[0]
        for readavail in ready:
            readable = started[readavail]
            self._trace("reading", readable)
            if not readable.try_read():
                continue
            self._trace("terminating", readable)
            readable.when_done_reading()
            del started[readavail]

    def all_done(self):
        """True when no started readable is still pending."""
        return not self.started

    def go_til_done(self):
        """Keep servicing readables until none is left."""
        while self.started:
            self.try_all_readables()
def test(machine=None, uri=None, Dict=None, port=HTTPPORT):
    """Send one request of each of the 3 formats to a cgi program.

    Arguments left as None fall back to a built-in sample search query.
    The cgi program may not recognise all request formats...
    """
    if machine is None:
        machine = "www.att.com"
    if uri is None:
        uri = "/cgi-bin/ATT_WEB/search"
    if Dict is None:
        Dict = {"all": "all",
                "keywords": "Internet services",
                "weight": "weight",
                "limit": 2}
    print("%s %s %s" % (machine, uri, Dict))

    # Build all three requests first, then hand them to a verbose loop.
    requests = []
    for request_class in (GETRequest, POSTRequest, Multi_Post_Request):
        requests.append(request_class(machine, uri, Dict, port))

    Looper = Read_Event_Loop()
    Looper.verbose = 1
    for request in requests:
        Looper.start_item(request)
    Looper.go_til_done()


if __name__ == "__main__":
    test()
def __init__(self):
    """Create an empty semantic keyword network.

    keywords  -- keyword -> score
    relations -- relation type ('next_to', 'near') -> {keyword set -> score}
    usable    -- truth value of the network; update() sets it to 1
    age, epic -- counters update() uses to derive its ageing ratio
    """
    self.keywords = {}
    self.relations = {'next_to': {}, 'near': {}}
    self.usable = 0
    self.age = 0.0
    self.epic = 30.0


def __nonzero__(self):
    """The network is true once it is usable."""
    return self.usable


def analyze(self, snet):
    """Score a page against this network.

    snet is the network built for the page (a url).  Every relation and
    keyword the page shares with this network contributes this network's
    score for it divided by its stored frequency.  The contributions are
    averaged per relation type and for the keywords, and the mean of the
    non-zero averages is returned; 0 when nothing contributes.

    Raises KeyError when snet carries a relation type this network does
    not have.
    """
    rel_score = {}
    numof_rscores = {}
    kw_score = 0
    numof_kscores = 0

    # The frequencies say in how many networks a word/set has been seen.
    freq_shelf = shelve.open(FREQ_SHELF)
    try:
        kw_freq = freq_shelf['keywords']
        set_freq = freq_shelf['sets']

        # Total score of the relations shared with the page, per type.
        for rel_type, relation in snet.relations.items():
            current_rels = self.relations[rel_type]
            type_freq = set_freq.get(rel_type, {})
            common_rels = []
            for new_rel in relation.keys():
                if new_rel in current_rels:
                    common_rels.append(new_rel)
            numof_rscores[rel_type] = len(common_rels)
            total = 0
            for rel in common_rels:
                # An entry can be in the network without a frequency
                # record (update() only records entries it adds itself
                # with a significant score); treat it as seen once, the
                # value update() stores on first sight.
                total = total + current_rels[rel] / type_freq.get(rel, 1.0)
            rel_score[rel_type] = total

        # Total score of the keywords shared with the page.
        for keyword in snet.keywords.keys():
            if keyword in self.keywords:
                kw_score = kw_score + \
                    self.keywords[keyword] / kw_freq.get(keyword, 1.0)
                numof_kscores = numof_kscores + 1

        # NOTE(review): this stores the unchanged keyword table under a
        # key equal to the shelf's own path; kept as found -- confirm
        # whether anything reads that entry.
        freq_shelf[FREQ_SHELF] = kw_freq
    finally:
        # Release the shelf even when a lookup above fails.
        freq_shelf.close()

    # Average/normalize: one average per contributing group.
    score_list = []
    for rel_type in snet.relations.keys():
        if rel_score[rel_type] and numof_rscores[rel_type]:
            score_list.append(rel_score[rel_type] / numof_rscores[rel_type])
    if kw_score and numof_kscores:
        score_list.append(kw_score / numof_kscores)

    cum_score = sum(score_list)
    if cum_score and score_list:
        return cum_score / float(len(score_list))
    return 0
def build(self, keywords=None, connectors=None):
    """Initialise the network from keywords and connectors.

    keywords   -- {keyword: score, ...}; stored as the network's keyword
                  table.  A fresh empty dict is used when omitted.
    connectors -- {relation type: collection of keyword sets, ...}; each
                  set becomes an entry of that relation type with score 0.
                  When omitted, both 'next_to' and 'near' are emptied.

    Also makes sure the frequency shelf has its 'keywords' and 'sets'
    tables.

    Raises ValueError when connectors names a relation type the network
    does not have.
    """
    # The defaults are created per call: the keyword dict is kept on the
    # instance and modified by update(), so a shared default object would
    # leak scores from one network into the next.
    if keywords is None:
        keywords = {}
    if connectors is None:
        connectors = {'next_to': (), 'near': ()}

    # NOTE(review): SOURCE had lost its indentation; the 'sets' table is
    # assumed to be created together with 'keywords', only when missing.
    freq_shelf = shelve.open(FREQ_SHELF)
    try:
        if 'keywords' not in freq_shelf:
            freq_shelf['keywords'] = {}
            freq_shelf['sets'] = {'next_to': {}, 'near': {}}
    finally:
        freq_shelf.close()

    self.keywords = keywords

    # Every relation starts at score 0; update() moves it with feedback.
    for rel_type, relations in connectors.items():
        if rel_type not in self.relations:
            raise ValueError(
                'Builder Error: type of connector not in network')
        scores = {}
        for keyword_set in relations:
            scores[keyword_set] = 0
        self.relations[rel_type] = scores
def update(self, new_snet, new_score):
    """Fold the rating of one analysed page into the network.

    new_snet  -- network of the rated page; its relation sets and keywords
                 name the entries to adjust (the scores stored in it are
                 not used)
    new_score -- the rating, anything float() accepts

    Known entries move towards new_score by an age-dependent weight, new
    entries start at new_score.  New entries with a significant score are
    counted in the frequency shelf.  Entries whose score ends up strictly
    between -0.11 and 0.11 are dropped, but only while the age ratio is
    inside one of the pruning windows.
    """
    self.usable = 1
    new_score = float(new_score)

    # Age ratio: floored at 0.2; when it reaches exactly 1.0 the epic is
    # doubled, or, once the epic exceeds 400, the age counter is reset.
    age = self.age / self.epic
    self.age = self.age + 1
    if age < 0.2:
        age = 0.2
    elif age == 1.0:
        if self.epic > 400:
            self.age = 220
        else:
            self.epic = self.epic * 2.0

    if __debug__:
        # NOTE(review): SOURCE had lost its indentation; show() is assumed
        # to belong to the debug output.
        print("age-> %s" % (age,))
        new_snet.show()

    # Share of the distance to new_score that a known entry moves.
    weight = pow(pow((10 * age), 1.5), -age)

    set_dellist = []
    kw_dellist = []

    freq_shelf = shelve.open(FREQ_SHELF)
    try:
        kw_freq = freq_shelf['keywords']
        set_freq = freq_shelf['sets']

        # New scores of the relations.
        for rel_type, relations in new_snet.relations.items():
            known = self.relations[rel_type]
            for keyword_set in relations.keys():
                is_new = keyword_set not in known
                if is_new:
                    known[keyword_set] = new_score
                else:
                    score = known[keyword_set]
                    known[keyword_set] = score + (new_score - score) * weight
                if -0.110 < known[keyword_set] < 0.110:
                    set_dellist.append((rel_type, keyword_set))
                elif is_new:
                    if keyword_set in set_freq[rel_type]:
                        set_freq[rel_type][keyword_set] = \
                            set_freq[rel_type][keyword_set] + 0.1
                    else:
                        set_freq[rel_type][keyword_set] = 1.0

        # New scores of the keywords.
        for keyword in new_snet.keywords.keys():
            is_new = keyword not in self.keywords
            if is_new:
                self.keywords[keyword] = new_score
            else:
                score = self.keywords[keyword]
                self.keywords[keyword] = score + (new_score - score) * weight
            if -0.110 < self.keywords[keyword] < 0.110:
                kw_dellist.append(keyword)
            elif is_new:
                if keyword in kw_freq:
                    kw_freq[keyword] = kw_freq[keyword] + 0.1
                else:
                    kw_freq[keyword] = 1.0

        # The tables read from the shelf are copies; store them back.
        freq_shelf['keywords'] = kw_freq
        freq_shelf['sets'] = set_freq
    finally:
        # Release the shelf even when a lookup above fails.
        freq_shelf.close()

    # Remove insignificant keywords and sets.
    if (age > 0.4 and age < 0.5) or (age > 0.75):
        for rel_type, keyword_set in set_dellist:
            del self.relations[rel_type][keyword_set]
        for keyword in kw_dellist:
            del self.keywords[keyword]


def get_keywords(self):
    """Return the network's strongest terms as query strings.

    Returns a tuple (top_kws, next_to, near):

    top_kws -- "+word " for each of the two highest positively scored
               keywords, then "-word " for the most negative keyword.  In
               case there are no negative scores yet, the third highest
               positive keyword takes the "-" slot.
    next_to -- for the 'next_to' relations, "+" and the words of the
               highest scored set, or "-" and the words of the lowest
               scored set when that one is stronger in magnitude.
    near    -- the same for the 'near' relations.

    A relation type without scored sets yields "+".
    """
    # Rank by score instead of relying on dict order: a one-pass scan
    # loses the runner-up whenever a new maximum shows up later.
    positive = []
    negative = []
    for word, score in self.keywords.items():
        if score > 0:
            positive.append((score, word))
        elif score < 0:
            negative.append((score, word))
    positive.sort()
    positive.reverse()
    negative.sort()

    wanted = [word for (score, word) in positive[:2]]
    if negative:
        unwanted = negative[0][1]
    elif len(positive) > 2:
        unwanted = positive[2][1]
    else:
        unwanted = ''

    top_kws = ''
    for word in wanted:
        if word:
            top_kws = top_kws + "+%s " % word
    if unwanted:
        top_kws = top_kws + "-%s " % unwanted

    def strongest(relations):
        # Track the extremes themselves, not just the sets: the final
        # comparison needs the best and the worst score.
        high_score = low_score = 0.0
        high_set = low_set = ()
        for keyword_set, score in relations.items():
            if score > high_score:
                high_score = score
                high_set = keyword_set
            elif score < low_score:
                low_score = score
                low_set = keyword_set
        if abs(low_score) > high_score:
            return '-' + ' '.join(low_set)
        return '+' + ' '.join(high_set)

    next_to = strongest(self.relations['next_to'])
    near = strongest(self.relations['near'])
    return (top_kws, next_to, near)