在Python中获取特定Reddit主题的所有评论

17

官方方式:

# Old (pre-OAuth) PRAW usage from the question: a descriptive user-agent
# string is passed directly to praw.Reddit.
r = praw.Reddit('Comment Scraper 1.0 by u/_Daimon_ see '
                 'https://praw.readthedocs.org/en/latest/'
                 'pages/comment_parsing.html')
# '11v36o' is the thread id taken from its URL (the part after /comments/).
submission = r.get_submission(submission_id='11v36o')
# Replace every "load more comments" stub; limit=None fetches them all,
# which is why this is slow on large threads.
submission.replace_more_comments(limit=None, threshold=0)

速度极慢。有什么方法可以加快速度吗?有些人已经将每个Reddit评论提取到数据库中,因此一定有某种更快的方法。


1
为什么要踩我?我在其他地方找不到答案(如果有,请给我提供链接)。 - Phylliida
1
专业提示:如果您只是在URL后面添加'.json',则可以将任何Reddit页面获取为JSON格式。然后,您可以在一个'while True'子句中循环,以便在浏览帖子的页面时从数据结构中获取评论。无需登录或其他任何操作。此外,我并没有给您投反对票,但我无法帮助您撤销该票,因为我已经用完了我的投票次数:( - Ulf Aslak
那其实并不是真的,JSON 包括这些“更多评论”的内容,要正确地展开这些内容非常复杂(在有更多评论的情况下)。 - Phylliida
1
不是很专业的技巧,哈哈。感谢您让我知道! - Ulf Aslak
我们应该将什么设置为 submission_id?你刚刚随意给它设置了一个项吗? - Mona Jalal
3个回答

28

编辑:新的 praw API(6.0.0)提供了 list() 方法,使这项工作更加容易:

配合 replace_more(limit=None) 使用,这也能处理因 MoreComments 对象而可能出现的 AttributeError。

# Resolve every "load more comments" stub, then flatten the whole tree.
submission.comments.replace_more(limit=None)
# comments.list() already returns a flat list of Comment objects, so the
# manual append loop of the original is unnecessary.
submissionList = list(submission.comments.list())

编辑:新的praw api(5.0.1)非常神奇,使这个过程变得更加容易。现在可以这样做:

def getSubComments(comment, allComments, verbose=True):
  """Collect ``comment`` and all of its descendants into ``allComments``.

  comment may be a real Comment (has a ``replies`` attribute) or a
  MoreComments stub, in which case its children are fetched via the API.
  """
  allComments.append(comment)
  if hasattr(comment, "replies"):
    children = comment.replies
  else:
    # MoreComments stub: fetch the hidden children from Reddit.
    children = comment.comments()
    if verbose:
      print("fetching (" + str(len(allComments)) + " comments fetched total)")
  for reply in children:
    getSubComments(reply, allComments, verbose=verbose)


def getAll(r, submissionId, verbose=True):
  """Return every comment in the submission identified by ``submissionId``.

  r is a praw.Reddit client; the heavy lifting is done by getSubComments.
  """
  collected = []
  for topLevel in r.submission(submissionId).comments:
    getSubComments(topLevel, collected, verbose=verbose)
  return collected

示例用法:

# Fetch every comment of thread 6rjwo1 (the id after /comments/ in its URL).
res = getAll(r, "6rjwo1")
#res = getAll(r, "6rjwo1", verbose=False) # This won't print out progress if you want it to be silent. Default is verbose=True

其中 r 表示

# Reddit API credentials for a script-type app; fill these in from
# https://www.reddit.com/prefs/apps before running.
username = 'myusernamehere'
userAgent = "MyAppName/0.1 by " + username
clientId = 'myClientId'
clientSecret = "myClientSecret"
password = "passwordformyusernamehere"
# NOTE(review): `password` is defined but not passed to praw.Reddit below,
# so this client is app-only — confirm whether user auth is intended.
r = praw.Reddit(user_agent=userAgent, client_id=clientId, client_secret=clientSecret)

以前的内容(已过时):

好的,我编写了一段代码,可以可靠地从一个主题中提取每个评论,并且对于500条评论大约需要10秒,对于4000条评论则需要约一分钟。我将其命名为redApi.py。以下是代码:

import time
import requests
import requests.auth
import praw

# Credentials shared by the praw client and the raw OAuth requests below;
# fill these in from https://www.reddit.com/prefs/apps.
username = 'myusernamehere'
userAgent = "MyAppName/0.1 by " + username
clientId = 'myClientId'
clientSecret = "myClientSecret"
password = "passwordformyusernamehere"

def getPraw():
  """Return a praw.Reddit client built from the module-level credentials."""
  return praw.Reddit(user_agent=userAgent, client_id=clientId, client_secret=clientSecret)

# Cached OAuth token payload, set lazily by makeRequest via getAccessToken.
# (A `global` statement at module level is a no-op, so it was removed.)
accessToken = None

def getAccessToken():
  """Request a password-grant OAuth token from Reddit; return the JSON payload."""
  auth = requests.auth.HTTPBasicAuth(clientId, clientSecret)
  payload = {"grant_type": "password", "username": username, "password": password}
  response = requests.post(
      "https://www.reddit.com/api/v1/access_token",
      auth=auth,
      data=payload,
      headers={"User-Agent": userAgent},
  )
  return response.json()

def makeRequest(apiUrl, useGet=True):
  """Perform an authenticated Reddit API request and return the parsed JSON.

  apiUrl: full oauth.reddit.com URL to call.
  useGet: GET when True, POST otherwise.

  Lazily obtains (and on a 401 refreshes) the module-level access token.
  Fixes over the original: the 401 retry now honors ``useGet`` (it always
  issued a GET before), and the Python-2 print statement is a function call.
  """
  global accessToken
  if accessToken is None:
    accessToken = getAccessToken()

  def _send():
    # Headers are rebuilt each time so a refreshed token is picked up.
    headers = {"Authorization": "bearer " + accessToken['access_token'], "User-Agent": userAgent}
    if useGet:
      return requests.get(apiUrl, headers=headers)
    return requests.post(apiUrl, headers=headers)

  response = _send()
  time.sleep(1.1)  # stay under Reddit's rate limit
  responseJson = response.json()
  # Some endpoints return a list, so guard before treating it as a dict.
  if isinstance(responseJson, dict) and responseJson.get('error') == 401:
    # Token expired: refresh it and retry once.
    print("Refreshing access token")
    time.sleep(1.1)
    accessToken = getAccessToken()
    time.sleep(1.1)
    response = _send()
    responseJson = response.json()
  return responseJson


# Module-level PRAW client shared by getPosts. (A `global` statement at
# module level is a no-op, so it was removed.)
prawReddit = praw.Reddit(user_agent=userAgent, client_id=clientId, client_secret=clientSecret)

# Gets any number of posts
# Gets any number of posts
def getPosts(subredditName, numPosts=1000):
  """Return up to ``numPosts`` submissions from subreddit ``subredditName``.

  Uses the shared module-level praw client. The original called the
  Python-2-only generator method ``.next()``; itertools.islice is
  version-independent and also stops cleanly (instead of raising
  StopIteration) if the subreddit has fewer than numPosts submissions.
  """
  from itertools import islice  # local import keeps the module import block unchanged
  global prawReddit
  subreddit = prawReddit.get_subreddit(subredditName)
  postGetter = praw.helpers.submissions_between(prawReddit, subreddit)
  return list(islice(postGetter, numPosts))






# Get all comments from a post
# Submission is a praw submission, obtained via:
# r = redApi.getPraw()
# submission = r.get_submission(submission_id='2zysz7') # (or some other submission id, found via https://www.reddit.com/r/test/comments/2zysz7/ayy/ - the thing after /comments/)
# comments = redApi.getComments(submission)
# Get all comments from a post
# Submission is a praw submission, obtained via:
# r = redApi.getPraw()
# submission = r.get_submission(submission_id='2zysz7')
# comments = redApi.getComments(submission)
def getComments(submission):
  """Fetch and fully expand the comment tree of a praw ``submission``.

  Returns the list of top-level comment dicts produced by
  getCommentsHelper, with replies nested under each comment's
  ['data']['replies'].
  """
  requestUrl = 'https://oauth.reddit.com/' + submission.subreddit.url + 'comments/article?&limit=1000&showmore=true&article=' + submission.id
  allData = makeRequest(requestUrl)
  # allData[0] is the article listing (unused); allData[1] holds the comments.
  curComments = allData[1]['data']['children']
  return getCommentsHelper(curComments, submission.name, submission)




# Print out the tree of comments
# Print out the tree of comments
def printTree(comments):
  """Return the comment forest rendered as an indented string."""
  return printTreeHelper(comments, "")


def printTreeHelper(comments, curIndentation):
  """Recursively render ``comments`` (raw API dicts) at the given indent.

  Each comment body is indented by curIndentation (continuation lines
  included); replies are rendered two spaces deeper.
  """
  pieces = []
  for node in comments:
    body = node['data']['body'].replace("\n", "\n" + curIndentation)
    pieces.append(curIndentation + body + "\n")
    replies = node['data']['replies']
    if replies != "":
      pieces.append(printTreeHelper(replies['data']['children'], curIndentation + "  "))
  return "".join(pieces)

# Get all comments as a single array  
# Get all comments as a single array
def flattenTree(comments):
  """Flatten a nested comment forest into one list, in pre-order."""
  flat = []
  for node in comments:
    flat.append(node)
    replies = node['data']['replies']
    if replies != "":
      flat.extend(flattenTree(replies['data']['children']))
  return flat





# Utility functions for getComments
# Utility functions for getComments
def expandCommentList(commentList, submission):
  """Resolve every "more" stub in ``commentList`` into real comments.

  Repeatedly calls the /api/morechildren endpoint (100 ids per request,
  its documented maximum) until no stubs remain, then returns all
  collected comments deduplicated by fullname.

  Fixes over the original: Python-2 print statements, `/100` float
  division under Python 3, a leftover "If they are shipping" debug print,
  and a dead trailing loop over an always-empty list were removed.
  """
  allComments = {}  # fullname -> comment dict; deduplicates across batches
  curComments = commentList
  while True:
    thingsToExpand = []
    nextBatch = []
    for comment in curComments:
      if comment['kind'] == "more":
        # Collect the child ids hidden behind this "load more" stub.
        thingsToExpand += comment['data']['children']
      else:
        allComments[comment['data']['name']] = comment

    if not thingsToExpand:
      break

    print("total things to expand: " + str(len(thingsToExpand)))
    # The morechildren endpoint accepts at most 100 ids per request.
    for i in range(0, len(thingsToExpand) // 100 + 1):
      curCommentIds = thingsToExpand[i * 100:min((i + 1) * 100, len(thingsToExpand))]
      requestUrl = 'https://oauth.reddit.com/api/morechildren.json?api_type=json&link_id=' + submission.name + '&limit=1000&showmore=true&children=' + ",".join(curCommentIds)
      curData = makeRequest(requestUrl)
      if 'json' in curData and 'data' in curData['json']:
        nextBatch += curData['json']['data']['things']
      print((i + 1) * 100)
    curComments = nextBatch

  return list(allComments.values())


def lookForMore(comment):
  """Return True if ``comment`` or any descendant is an unexpanded "more" stub."""
  if comment['kind'] == "more":
    return True
  replies = comment['data']['replies']
  if replies == "":
    return False
  return any(lookForMore(child) for child in replies['data']['children'])

def getCommentsHelper(curComments, rootId, submission):
  """Recursively reconstruct the comment tree whose parent is ``rootId``.

  curComments: raw comment dicts from the API (possibly with "more" stubs).
  rootId: fullname of the parent whose direct children are being assembled.
  submission: praw submission, used for follow-up API requests.
  Returns the list of root-level comment dicts with replies nested under
  each comment's ['data']['replies'].

  Only minimal fixes applied: the Python-2 print statement and an unused
  local (`articleData`); the statement order is otherwise preserved.
  """
  # Expand every "more" stub at this level into real comments first.
  allComments = expandCommentList(curComments, submission)

  # Index by fullname so children can be attached to their parents.
  commentMap = {}
  for comment in allComments:
    commentMap[comment['data']['name']] = comment


  allRootComments = []
  for comment in allComments:
    if comment['data']['parent_id'] == rootId:
      allRootComments.append(comment)
    elif comment['data']['parent_id'] in commentMap:
      parentComment = commentMap[comment['data']['parent_id']]
      if parentComment['data']['replies'] == "":
        parentComment['data']['replies'] = {'data': {'children': []}}
      # Avoid attaching the same child twice across expansion rounds.
      alreadyChild = False
      for childComment in parentComment['data']['replies']['data']['children']:
        if childComment['data']['name'] == comment['data']['name']:
          alreadyChild = True
          break
      if not alreadyChild:
        parentComment['data']['replies']['data']['children'].append(comment)
    else:
      # Orphan: parent is neither the root nor in this batch.
      print("pls halp")

  completedComments = []
  needMoreComments = []

  for comment in allRootComments:
    if not comment['data']['replies'] == "" or comment['kind'] == 'more':
      hasMore = lookForMore(comment)

      if hasMore:
        # Subtree still contains "more" stubs; re-fetch it in full below.
        needMoreComments.append(comment)
      else:
        replyComments = getCommentsHelper(comment['data']['replies']['data']['children'], comment['data']['name'], submission)
        comment['data']['replies']['data']['children'] = replyComments
        completedComments.append(comment)
    else:
      completedComments.append(comment)
  for comment in needMoreComments:
    # Fetch the comment's own permalink page, which expands its subtree.
    requestUrl = 'https://oauth.reddit.com/' + submission.subreddit.url + 'comments/article?&limit=1000&showmore=true&article=' + submission.id + "&comment=" + comment['data']['id']
    allData = makeRequest(requestUrl)
    comment = allData[1]['data']['children'][0]
    if comment['data']['replies'] == "":
      completedComments.append(comment)
    else:
      comments = comment['data']['replies']['data']['children']
      actualComments = getCommentsHelper(comments, comment['data']['name'], submission)
      comment['data']['replies']['data']['children'] = actualComments
      completedComments.append(comment)

  return completedComments

要使用这个脚本,在Python提示符中输入以下内容:
# Get all comments from a post
# Submission is a praw submission, obtained via:
# (run from a Python prompt after saving the script above as redApi.py)
r = redApi.getPraw()
submission = r.get_submission(submission_id='2zysz7') # (or some other submission id, found via https://www.reddit.com/r/test/comments/2zysz7/ayy/ - the thing after /comments/)
comments = redApi.getComments(submission)

1
要使用此功能,您需要从这里安装PRAW。您需要下载zip文件并执行python setup.py install命令,因为pip和easy_install版本不同且功能不同。虽然这也可能在这些版本中完成,但是所有文档都是针对此版本的。 - Phylliida
1
同时,遗憾的是,这似乎只适用于评论数在5000以下的帖子。对于更大的帖子,它会出现问题并抛出错误。我正在解决这个问题。 - Phylliida
当我运行你的代码时,出现了“KeyError: 'access_token'”错误。你知道如何修复吗? - Mona Jalal
非常有帮助!谢谢! - Zaya
看起来你的praw 5.0.1使用dfs正确地给出了评论树。最新版本(使用列表的版本)只会首先给出顶级评论,然后给出下一级评论,以此类推。 - HyukHyukBoi
像个老板一样处理:非常感谢! - user2550228

5

看起来praw已经更新了?在4.5.1版本中,它看起来更像:

#!/usr/local/bin/python
import praw

# Read-only PRAW 4.x client; fill in the app credentials from
# https://www.reddit.com/prefs/apps.
reddit = praw.Reddit(
    client_id='<client_id>',
    client_secret='<client_secret>',
    user_agent='davehodg/0.1')

submission = reddit.submission(id='<submission_id>')
# Alternative if you only have the URL (use one of the two, not both; the
# original line was misspelled "sumbission" and never took effect):
#submission = reddit.submission(url='<submission_url>')

# Print the body of every comment, flattened.
for comment in submission.comments.list():
    print(comment.body)

编辑:看起来我能得到的最多评论数量是1000个?

注:该评论可能与IT技术无关。


1
我正在添加一些打印和调试,但现在 @danielle 的脚本什么也没有做。只是返回到提示符。

2
我还添加了如何使用它的说明。很抱歉如果之前没有说清楚需要在脚本中调用一个函数。 - Phylliida
2
此外,在脚本顶部,您需要填写相关的Reddit API开发者详细信息。 - Phylliida

网页内容由 Stack Overflow 提供,点击下面的原文链接可以查看英文原文。
原文链接