在Python中获取特定Reddit主题的所有评论

17

官方方式:

# Old (pre-OAuth) PRAW usage from the question: a descriptive user-agent
# string is passed directly to praw.Reddit.
r = praw.Reddit('Comment Scraper 1.0 by u/_Daimon_ see '
                 'https://praw.readthedocs.org/en/latest/'
                 'pages/comment_parsing.html')
# '11v36o' is the thread id taken from its URL (the part after /comments/).
submission = r.get_submission(submission_id='11v36o')
# Replace every "load more comments" stub; limit=None fetches them all,
# which is why this is slow on large threads.
submission.replace_more_comments(limit=None, threshold=0)

速度极慢。有什么方法可以加快速度吗?有些人已经将每个Reddit评论提取到数据库中,因此一定有某种更快的方法。


1
为什么要踩我?我在其他地方找不到答案(如果有,请给我提供链接)。 - Phylliida
1
专业提示:如果您只是在URL后面添加'.json',则可以将任何Reddit页面获取为JSON格式。然后,您可以在一个'while True'子句中循环,以便在浏览帖子的页面时从数据结构中获取评论。无需登录或其他任何操作。此外,我并没有给您投反对票,但我无法帮助您撤销该票,因为我已经用完了我的投票次数:( - Ulf Aslak
那其实并不是真的,JSON 包括这些“更多评论”的内容,要正确地展开这些内容非常复杂(在有更多评论的情况下)。 - Phylliida
1
不是很专业的技巧,哈哈。感谢您让我知道! - Ulf Aslak
我们应该将什么设置为 submission_id?你刚刚随意给它设置了一个项吗? - Mona Jalal
3个回答

28

编辑:新的 praw API(6.0.0)提供了 list() 方法,使这项工作更加容易:

配合 replace_more(limit=None) 使用,这也能处理因 MoreComments 对象而可能出现的 AttributeError。

# Resolve every "load more comments" stub, then flatten the whole tree.
submission.comments.replace_more(limit=None)
# comments.list() already returns a flat list of Comment objects, so the
# manual append loop of the original is unnecessary.
submissionList = list(submission.comments.list())

编辑:新的praw api(5.0.1)非常神奇,使这个过程变得更加容易。现在可以这样做:

def getSubComments(comment, allComments, verbose=True):
  """Collect ``comment`` and all of its descendants into ``allComments``.

  comment may be a real Comment (has a ``replies`` attribute) or a
  MoreComments stub, in which case its children are fetched via the API.
  """
  allComments.append(comment)
  if hasattr(comment, "replies"):
    children = comment.replies
  else:
    # MoreComments stub: fetch the hidden children from Reddit.
    children = comment.comments()
    if verbose:
      print("fetching (" + str(len(allComments)) + " comments fetched total)")
  for reply in children:
    getSubComments(reply, allComments, verbose=verbose)


def getAll(r, submissionId, verbose=True):
  """Return every comment in the submission identified by ``submissionId``.

  r is a praw.Reddit client; the heavy lifting is done by getSubComments.
  """
  collected = []
  for topLevel in r.submission(submissionId).comments:
    getSubComments(topLevel, collected, verbose=verbose)
  return collected

示例用法:

# Fetch every comment of thread 6rjwo1 (the id after /comments/ in its URL).
res = getAll(r, "6rjwo1")
#res = getAll(r, "6rjwo1", verbose=False) # This won't print out progress if you want it to be silent. Default is verbose=True

其中 r 表示

# Reddit API credentials for a script-type app; fill these in from
# https://www.reddit.com/prefs/apps before running.
username = 'myusernamehere'
userAgent = "MyAppName/0.1 by " + username
clientId = 'myClientId'
clientSecret = "myClientSecret"
password = "passwordformyusernamehere"
# NOTE(review): `password` is defined but not passed to praw.Reddit below,
# so this client is app-only — confirm whether user auth is intended.
r = praw.Reddit(user_agent=userAgent, client_id=clientId, client_secret=clientSecret)

以前的内容(已过时):

好的,我编写了一段代码,可以可靠地从一个主题中提取每个评论,并且对于500条评论大约需要10秒,对于4000条评论则需要约一分钟。我将其命名为redApi.py。以下是代码:

import time
import requests
import requests.auth
import praw

# Credentials shared by the praw client and the raw OAuth requests below;
# fill these in from https://www.reddit.com/prefs/apps.
username = 'myusernamehere'
userAgent = "MyAppName/0.1 by " + username
clientId = 'myClientId'
clientSecret = "myClientSecret"
password = "passwordformyusernamehere"

def getPraw():
  """Return a praw.Reddit client built from the module-level credentials."""
  return praw.Reddit(user_agent=userAgent, client_id=clientId, client_secret=clientSecret)

# Cached OAuth token payload, set lazily by makeRequest via getAccessToken.
# (A `global` statement at module level is a no-op, so it was removed.)
accessToken = None

def getAccessToken():
  """Request a password-grant OAuth token from Reddit; return the JSON payload."""
  auth = requests.auth.HTTPBasicAuth(clientId, clientSecret)
  payload = {"grant_type": "password", "username": username, "password": password}
  response = requests.post(
      "https://www.reddit.com/api/v1/access_token",
      auth=auth,
      data=payload,
      headers={"User-Agent": userAgent},
  )
  return response.json()

def makeRequest(apiUrl, useGet=True):
  """Perform an authenticated Reddit API request and return the parsed JSON.

  apiUrl: full oauth.reddit.com URL to call.
  useGet: GET when True, POST otherwise.

  Lazily obtains (and on a 401 refreshes) the module-level access token.
  Fixes over the original: the 401 retry now honors ``useGet`` (it always
  issued a GET before), and the Python-2 print statement is a function call.
  """
  global accessToken
  if accessToken is None:
    accessToken = getAccessToken()

  def _send():
    # Headers are rebuilt each time so a refreshed token is picked up.
    headers = {"Authorization": "bearer " + accessToken['access_token'], "User-Agent": userAgent}
    if useGet:
      return requests.get(apiUrl, headers=headers)
    return requests.post(apiUrl, headers=headers)

  response = _send()
  time.sleep(1.1)  # stay under Reddit's rate limit
  responseJson = response.json()
  # Some endpoints return a list, so guard before treating it as a dict.
  if isinstance(responseJson, dict) and responseJson.get('error') == 401:
    # Token expired: refresh it and retry once.
    print("Refreshing access token")
    time.sleep(1.1)
    accessToken = getAccessToken()
    time.sleep(1.1)
    response = _send()
    responseJson = response.json()
  return responseJson


# Module-level PRAW client shared by getPosts. (A `global` statement at
# module level is a no-op, so it was removed.)
prawReddit = praw.Reddit(user_agent=userAgent, client_id=clientId, client_secret=clientSecret)

# Gets any number of posts
# Gets any number of posts
def getPosts(subredditName, numPosts=1000):
  """Return up to ``numPosts`` submissions from subreddit ``subredditName``.

  Uses the shared module-level praw client. The original called the
  Python-2-only generator method ``.next()``; itertools.islice is
  version-independent and also stops cleanly (instead of raising
  StopIteration) if the subreddit has fewer than numPosts submissions.
  """
  from itertools import islice  # local import keeps the module import block unchanged
  global prawReddit
  subreddit = prawReddit.get_subreddit(subredditName)
  postGetter = praw.helpers.submissions_between(prawReddit, subreddit)
  return list(islice(postGetter, numPosts))






# Get all comments from a post
# Submission is a praw submission, obtained via:
# r = redApi.getPraw()
# submission = r.get_submission(submission_id='2zysz7') # (or some other submission id, found via https://www.reddit.com/r/test/comments/2zysz7/ayy/ - the thing after /comments/)
# comments = redApi.getComments(submission)
# Get all comments from a post
# Submission is a praw submission, obtained via:
# r = redApi.getPraw()
# submission = r.get_submission(submission_id='2zysz7')
# comments = redApi.getComments(submission)
def getComments(submission):
  """Fetch and fully expand the comment tree of a praw ``submission``.

  Returns the list of top-level comment dicts produced by
  getCommentsHelper, with replies nested under each comment's
  ['data']['replies'].
  """
  requestUrl = 'https://oauth.reddit.com/' + submission.subreddit.url + 'comments/article?&limit=1000&showmore=true&article=' + submission.id
  allData = makeRequest(requestUrl)
  # allData[0] is the article listing (unused); allData[1] holds the comments.
  curComments = allData[1]['data']['children']
  return getCommentsHelper(curComments, submission.name, submission)




# Print out the tree of comments
# Print out the tree of comments
def printTree(comments):
  """Return the comment forest rendered as an indented string."""
  return printTreeHelper(comments, "")


def printTreeHelper(comments, curIndentation):
  """Recursively render ``comments`` (raw API dicts) at the given indent.

  Each comment body is indented by curIndentation (continuation lines
  included); replies are rendered two spaces deeper.
  """
  pieces = []
  for node in comments:
    body = node['data']['body'].replace("\n", "\n" + curIndentation)
    pieces.append(curIndentation + body + "\n")
    replies = node['data']['replies']
    if replies != "":
      pieces.append(printTreeHelper(replies['data']['children'], curIndentation + "  "))
  return "".join(pieces)

# Get all comments as a single array  
# Get all comments as a single array
def flattenTree(comments):
  """Flatten a nested comment forest into one list, in pre-order."""
  flat = []
  for node in comments:
    flat.append(node)
    replies = node['data']['replies']
    if replies != "":
      flat.extend(flattenTree(replies['data']['children']))
  return flat





# Utility functions for getComments
# Utility functions for getComments
def expandCommentList(commentList, submission):
  """Resolve every "more" stub in ``commentList`` into real comments.

  Repeatedly calls the /api/morechildren endpoint (100 ids per request,
  its documented maximum) until no stubs remain, then returns all
  collected comments deduplicated by fullname.

  Fixes over the original: Python-2 print statements, `/100` float
  division under Python 3, a leftover "If they are shipping" debug print,
  and a dead trailing loop over an always-empty list were removed.
  """
  allComments = {}  # fullname -> comment dict; deduplicates across batches
  curComments = commentList
  while True:
    thingsToExpand = []
    nextBatch = []
    for comment in curComments:
      if comment['kind'] == "more":
        # Collect the child ids hidden behind this "load more" stub.
        thingsToExpand += comment['data']['children']
      else:
        allComments[comment['data']['name']] = comment

    if not thingsToExpand:
      break

    print("total things to expand: " + str(len(thingsToExpand)))
    # The morechildren endpoint accepts at most 100 ids per request.
    for i in range(0, len(thingsToExpand) // 100 + 1):
      curCommentIds = thingsToExpand[i * 100:min((i + 1) * 100, len(thingsToExpand))]
      requestUrl = 'https://oauth.reddit.com/api/morechildren.json?api_type=json&link_id=' + submission.name + '&limit=1000&showmore=true&children=' + ",".join(curCommentIds)
      curData = makeRequest(requestUrl)
      if 'json' in curData and 'data' in curData['json']:
        nextBatch += curData['json']['data']['things']
      print((i + 1) * 100)
    curComments = nextBatch

  return list(allComments.values())


def lookForMore(comment):
  """Return True if ``comment`` or any descendant is an unexpanded "more" stub."""
  if comment['kind'] == "more":
    return True
  replies = comment['data']['replies']
  if replies == "":
    return False
  return any(lookForMore(child) for child in replies['data']['children'])

def getCommentsHelper(curComments, rootId, submission):
  """Recursively reconstruct the comment tree whose parent is ``rootId``.

  curComments: raw comment dicts from the API (possibly with "more" stubs).
  rootId: fullname of the parent whose direct children are being assembled.
  submission: praw submission, used for follow-up API requests.
  Returns the list of root-level comment dicts with replies nested under
  each comment's ['data']['replies'].

  Only minimal fixes applied: the Python-2 print statement and an unused
  local (`articleData`); the statement order is otherwise preserved.
  """
  # Expand every "more" stub at this level into real comments first.
  allComments = expandCommentList(curComments, submission)

  # Index by fullname so children can be attached to their parents.
  commentMap = {}
  for comment in allComments:
    commentMap[comment['data']['name']] = comment


  allRootComments = []
  for comment in allComments:
    if comment['data']['parent_id'] == rootId:
      allRootComments.append(comment)
    elif comment['data']['parent_id'] in commentMap:
      parentComment = commentMap[comment['data']['parent_id']]
      if parentComment['data']['replies'] == "":
        parentComment['data']['replies'] = {'data': {'children': []}}
      # Avoid attaching the same child twice across expansion rounds.
      alreadyChild = False
      for childComment in parentComment['data']['replies']['data']['children']:
        if childComment['data']['name'] == comment['data']['name']:
          alreadyChild = True
          break
      if not alreadyChild:
        parentComment['data']['replies']['data']['children'].append(comment)
    else:
      # Orphan: parent is neither the root nor in this batch.
      print("pls halp")

  completedComments = []
  needMoreComments = []

  for comment in allRootComments:
    if not comment['data']['replies'] == "" or comment['kind'] == 'more':
      hasMore = lookForMore(comment)

      if hasMore:
        # Subtree still contains "more" stubs; re-fetch it in full below.
        needMoreComments.append(comment)
      else:
        replyComments = getCommentsHelper(comment['data']['replies']['data']['children'], comment['data']['name'], submission)
        comment['data']['replies']['data']['children'] = replyComments
        completedComments.append(comment)
    else:
      completedComments.append(comment)
  for comment in needMoreComments:
    # Fetch the comment's own permalink page, which expands its subtree.
    requestUrl = 'https://oauth.reddit.com/' + submission.subreddit.url + 'comments/article?&limit=1000&showmore=true&article=' + submission.id + "&comment=" + comment['data']['id']
    allData = makeRequest(requestUrl)
    comment = allData[1]['data']['children'][0]
    if comment['data']['replies'] == "":
      completedComments.append(comment)
    else:
      comments = comment['data']['replies']['data']['children']
      actualComments = getCommentsHelper(comments, comment['data']['name'], submission)
      comment['data']['replies']['data']['children'] = actualComments
      completedComments.append(comment)

  return completedComments

要使用这个脚本,在Python提示符中输入以下内容:
# Get all comments from a post
# Submission is a praw submission, obtained via:
# (run from a Python prompt after saving the script above as redApi.py)
r = redApi.getPraw()
submission = r.get_submission(submission_id='2zysz7') # (or some other submission id, found via https://www.reddit.com/r/test/comments/2zysz7/ayy/ - the thing after /comments/)
comments = redApi.getComments(submission)

1
要使用此功能,您需要从这里安装PRAW。您需要下载zip文件并执行python setup.py install命令,因为pip和easy_install版本不同且功能不同。虽然这也可能在这些版本中完成,但是所有文档都是针对此版本的。 - Phylliida
1
同时,遗憾的是,这似乎只适用于评论数在5000以下的帖子。对于更大的帖子,它会出现问题并抛出错误。我正在解决这个问题。 - Phylliida
当我运行你的代码时,出现了“KeyError: 'access_token'”错误。你知道如何修复吗? - Mona Jalal
非常有帮助!谢谢! - Zaya
看起来你的praw 5.0.1使用dfs正确地给出了评论树。最新版本(使用列表的版本)只会首先给出顶级评论,然后给出下一级评论,以此类推。 - HyukHyukBoi
像个老板一样处理:非常感谢! - user2550228

5

看起来praw已经更新了?在4.5.1版本中,它看起来更像:

#!/usr/local/bin/python
import praw

# Read-only PRAW 4.x client; fill in the app credentials from
# https://www.reddit.com/prefs/apps.
reddit = praw.Reddit(
    client_id='<client_id>',
    client_secret='<client_secret>',
    user_agent='davehodg/0.1')

submission = reddit.submission(id='<submission_id>')
# Alternative if you only have the URL (use one of the two, not both; the
# original line was misspelled "sumbission" and never took effect):
#submission = reddit.submission(url='<submission_url>')

# Print the body of every comment, flattened.
for comment in submission.comments.list():
    print(comment.body)

编辑:看起来我能得到的最多评论数量是1000个?

注:该评论可能与IT技术无关。


1
我正在添加一些打印和调试,但现在 @danielle 的脚本什么也没有做。只是返回到提示符。

2
我还添加了如何使用它的说明。很抱歉如果之前没有说清楚需要在脚本中调用一个函数。 - Phylliida
2
此外,在脚本顶部,您需要填写相关的Reddit API开发者详细信息。 - Phylliida

网页内容由 Stack Overflow 提供,点击下面的原文链接可以查看英文原文。
原文链接