Posted by signal on Wed 23 Jul 04:17
report abuse | download | new post
def MeFiTags(path='tagdata_mefi.txt', factor_min=0.01, factor_max=1.0):
    """Load tag data and select the tags whose frequency is in a given band.

    The file is read as tab-separated text.  The first two lines are
    skipped; every following line must contain exactly four fields:
    tag_id, link_id, link_date, tag_name.  Tag names are lower-cased and
    any tag containing "brokenlink" is ignored.

    Args:
        path: tag data file to read.
        factor_min: exclusive lower bound for a tag's frequency, expressed
            as a fraction of the highest tag frequency.
        factor_max: exclusive upper bound, expressed the same way.

    Returns:
        A ``(tags, taglist_inrange)`` tuple where ``tags`` maps
        link_id => list of tag names and ``taglist_inrange`` lists every
        tag whose frequency lies strictly between the two bounds.

    Raises:
        ValueError: if a data line does not split into four fields.
    """
    tags = {}
    taglist = {}
    with open(path) as data:
        # Skip the two header lines.
        data.readline()
        data.readline()
        for line in data:
            # Strip only the line terminator; slicing off the last character
            # would eat a real letter when the final line has no newline.
            fields = line.rstrip('\r\n').split('\t')
            tag_id, link_id, link_date, tag_name = fields
            tag = tag_name.lower()
            if "brokenlink" in tag:
                continue
            tags.setdefault(link_id, []).append(tag)
            taglist[tag] = taglist.get(tag, 0) + 1

    # Take the maximum from the final counts.  Tracking it inside the loop
    # from the pre-increment count left it one below the real maximum.
    freq_max = max(taglist.values()) if taglist else 0
    computed_min = freq_max * factor_min
    computed_max = freq_max * factor_max

    taglist_inrange = []
    for tag in taglist:
        count = taglist[tag]
        if computed_min < count < computed_max:
            taglist_inrange.append(tag)
        elif count >= computed_max:
            # Report every tag rejected for being too frequent.
            print("%s %s" % (tag, count))
    print("total tags in range %d" % len(taglist_inrange))
    # 'tags' dict link_id => list of tag_names
    # 'taglist_inrange' list all tags within max and min
    return tags, taglist_inrange
def MeFiUsers(path='usernames.txt'):
    """Load the user id => user name mapping from a tab-separated file.

    The first two lines are skipped; every following line must contain
    exactly three fields: userid, joindate, name.

    Args:
        path: user data file to read.

    Returns:
        dict userid => username (both as strings).

    Raises:
        ValueError: if a data line does not split into three fields.
    """
    users = {}
    with open(path) as data:
        # Skip the two header lines.
        data.readline()
        data.readline()
        for line in data:
            # Remove only the line terminator so the last name in a file
            # without a trailing newline is not truncated.
            userid, joindate, name = line.rstrip('\r\n').split('\t')
            users[userid] = name
    # dict userid => username
    return users
def MeFiPosts(path='postdata_mefi.txt', min_posts=5):
    """Load post ids grouped by author, keeping only sufficiently active users.

    The first two lines are skipped.  On every following line the first
    two tab-separated fields are taken as postid and userid; any further
    fields are ignored.

    Args:
        path: post data file to read.
        min_posts: a user is kept only when their post count is strictly
            greater than this value.

    Returns:
        dict userid => list of postids, in file order.

    Raises:
        ValueError: if a data line has fewer than two fields.
    """
    posts = {}
    with open(path) as data:
        # jump 2 lines
        data.readline()
        data.readline()
        for line in data:
            # Strip the terminator first so a two-column line does not
            # leave a newline glued to the user id.
            postid, userid = line.rstrip('\r\n').split('\t')[:2]
            posts.setdefault(userid, []).append(postid)

    posts_valid = {}
    for userid, postids in posts.items():
        if len(postids) > min_posts:
            posts_valid[userid] = postids
    # dict userid => list of postids
    return posts_valid
def writeMefiUserTags(outpath='mefidata.txt'):
    """Write a tab-separated user-by-tag count matrix.

    Data comes from MeFiTags(), MeFiUsers() and MeFiPosts(), defined in
    this file and called with their default paths.  The first output line
    is ``User`` followed by every in-range tag.  Each later line holds one
    user name followed by, for each of those tags, the number of times the
    tag appears on that user's posts.  Only users present in the MeFiPosts()
    result are written.  Progress is printed every 100 users and the total
    is printed at the end.

    Args:
        outpath: file to write the matrix to.
    """
    tags, taglist = MeFiTags()
    users = MeFiUsers()
    posts = MeFiPosts()

    userct = 0
    with open(outpath, 'w') as out:
        out.write('User' + ''.join('\t%s' % tag for tag in taglist) + '\n')
        for user_id in users:
            if user_id not in posts:  # user has no qualifying posts
                continue
            # Count how often each tag occurs across this user's posts.
            user_tags = {}
            for post_id in posts[user_id]:
                for tag in tags.get(post_id, ()):  # untagged posts add nothing
                    user_tags[tag] = user_tags.get(tag, 0) + 1
            # One column per in-range tag, 0 when the user never used it.
            scores = ''.join('\t%d' % user_tags.get(tag, 0) for tag in taglist)
            out.write(users[user_id] + scores + '\n')
            # Progress is reported before counting the current user, so the
            # first message reads "0 users written".
            if userct % 100 == 0:
                print("%s users written" % userct)
            userct += 1
    print("%d users" % userct)
if __name__ == "__main__":
    # Run the export only when executed as a script, not when imported.
    writeMefiUserTags()
Submit a correction or amendment below (click here to make a fresh posting)
After submitting an amendment, you'll be able to view the differences between the old and new posts easily.