UnicodeDecodeError: 'ascii' codec can't decode byte 0xec in position

3

When I program with Python 2.7 and MySQLdb on Ubuntu, this error appears whenever I use a language other than English in Python; with English-only text the error does not occur.

Traceback (most recent call last):
  File "crawl.py", line 242, in <module>
    parseArticle( u )
  File "crawl.py", line 146, in parseArticle
    gatherNeighborInfo( soup )
  File "crawl.py", line 69, in gatherNeighborInfo
    db.updateURL( url , '자신의 글 주소들을 db에 저장합니다' )
  File "crawl.py", line 211, in updateURL
    self.cursor.execute("UPDATE urls SET state=%d,content='%s' WHERE url='%s'"%(state,content,url))
UnicodeDecodeError: 'ascii' codec can't decode byte 0xec in position 33: ordinal not in range(128)

I tried to change the default encoding from ASCII to UTF-8. I created a file named sitecustomize.py under /usr/local/lib/python2.7/site-packages/ with the following contents:

import sys
sys.setdefaultencoding("utf-8")

But nothing changed. Please help me. Here is the entire source code:
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup
import robotparser
import urllib2
import time, traceback, re
import MySQLdb

crawler_name = 'daum_blog_crawler'      
mainpage = 'http://blog.daum.net/'      

# robot parser setting.
rp = robotparser.RobotFileParser( mainpage + 'robots.txt' )
rp.read()

def canFetch( url ):
        return rp.can_fetch( crawler_name, url )

def getContent( url, delay=1):
        time.sleep( delay )

        if not canFetch( url ):
                #print 'This url can NOT be fetched by our crawler :', url
                return None
        try:
                opener = urllib2.build_opener()
                opener.addheaders = [('User-agent',crawler_name)]
                contents = opener.open(url).read()
        except:
                traceback.print_exc()
                return None
        return contents

def getArticleInfo( soup ):

        rBlog = re.compile('.+blog.daum.net/\w+/\d+.*?')
        URLs = soup('a',{'href':rBlog})

        return [ u.get('href').split('?')[0] for u in URLs ]

def getOwnArticles( contents ):
        ret = []
        soup = BeautifulSoup( contents )
        rBlog = re.compile('.+/BlogTypeView.+')
        for u in soup('a',{'href':rBlog}):
                href = u.get('href')
                article = href.split('articleno=')[1].split('&')[0]
                if ret.count(article)<1:
                        ret.append( article )
        return ret

def gatherNeighborInfo( soup ):


        rBlog = re.compile('http://blog.daum.net/\w+')
        Neighbors = soup('a',{'href':rBlog})
        cnt = 0
        for n in Neighbors:
                url = n.get('href')
                blogname = url.split('/')[-1]
                if url and url.startswith('http://') and db.isCrawledURL(url)<1:
                        db.insertURL( url, 1 ) 
                        db.updateURL( url , '자신의 글 주소들을 db에 저장합니다' )

                        url2 = getRedirectedURL( url )
                        if not url2: continue
                        re_url = 'http://blog.daum.net' + url2
                        body = getContent( re_url, 0 ) 
                        if body:
                                for u in getOwnArticles( body ):

                                        fullpath = 'http://blog.daum.net/'+blogname+'/'+u
                                        cnt+=db.insertURL( fullpath )

        if cnt>0: print '%d neighbor articles inserted'%cnt

def getRedirectedURL( url ):

        contents = getContent( url )
        if not contents: return None

        #redirect
        try:
                soup = BeautifulSoup( contents )
                frame = soup('frame')           
                src = frame[0].get('src')
        except:
                src = None
        return src

def getBody( soup, parent ):

        rSrc = re.compile('.+/ArticleContentsView.+')
        iframe = soup('iframe',{'src':rSrc})
        if len(iframe)>0: 
                src = iframe[0].get('src')
                iframe_src = 'http://blog.daum.net'+src


                req = urllib2.Request( iframe_src )
                req.add_header('Referer', parent )
                body = urllib2.urlopen(req).read()
                soup = BeautifulSoup( body )
                return str(soup.body)
        else:
                print 'NULL contents'
                return ''

def parseArticle( url ):

        article_id = url.split('/')[-1]
        blog_id = url.split('/')[-2]

        if blog_id.isdigit():
                print 'digit:', url.split('/')


        newURL = getRedirectedURL( url )

        if newURL:

                newURL = 'http://blog.daum.net'+newURL
                print 'redirecting', newURL
                contents = getContent( newURL, 0 )
                if not contents:
                        print 'Null Contents...'

                        db.updateURL( url, -1 )
                        return


                soup = BeautifulSoup( contents )


                gatherNeighborInfo( soup )              


                n=0
                for u in getArticleInfo( soup ):
                        n+=db.insertURL( u )
                if n>0: print 'inserted %d urls from %s'%(n,url)

                sp = contents.find('<title>')
                if sp>-1:
                        ep = contents[sp+7:].find('</title>')
                        title = contents[sp+7:sp+ep+7]
                else:
                        title = ''

                contents = getBody( soup, newURL )  


                pStyle = re.compile('<style(.*?)>(.*?)</style>', re.IGNORECASE | re.MULTILINE | re.DOTALL )
                contents = pStyle.sub('', contents)

                pStyle = re.compile('<script(.*?)>(.*?)</script>', re.IGNORECASE | re.MULTILINE | re.DOTALL )
                contents = pStyle.sub('', contents)

                pStyle = re.compile("<(.*?)>", re.IGNORECASE | re.MULTILINE | re.DOTALL )
                contents = pStyle.sub("", contents)


                db.updateURL( url , '처리했다고 db에 표시합니다.' )

        else:
                print 'Invalid blog article...'

                db.updateURL( url, 'None', -1 )

class DB:
        "MySQL wrapper class"
        def __init__(self):
                self.conn = MySQLdb.connect(db='crawlDB', user='root', passwd='......')
                self.cursor = self.conn.cursor()
                self.cursor.execute('CREATE TABLE IF NOT EXISTS urls(url CHAR(150), state INT, content TEXT)')
        def commit(self):
                self.conn.commit()
        def __del__(self):
                self.conn.commit()
                self.cursor.close()

        def insertURL(self, url, state=0, content=None):
                if url[-1]=='/': url=url[:-1]
                try:    
                        self.cursor.execute("INSERT INTO urls VALUES ('%s',%d,'%s')"%(url,state,content))
                except:
                        return 0
                else:
                        return 1

        def selectUncrawledURL(self):
                self.cursor.execute('SELECT * FROM urls where state=0')
                return [ row[0] for row in self.cursor.fetchall() ]

        def updateURL(self, url, content, state=1):
                if url[-1]=='/': url=url[:-1]
                self.cursor.execute("UPDATE urls SET state=%d,content='%s' WHERE url='%s'"%(state,content,url))

        def isCrawledURL(self, url):
                if url[-1]=='/': url=url[:-1]
                self.cursor.execute("SELECT COUNT(*) FROM urls WHERE url='%s' AND state=1"%url)
                ret = self.cursor.fetchone()
                return ret[0]

db = DB()

if __name__=='__main__':
        print 'starting crawl.py...'

        contents = getContent( mainpage )
        URLs = getArticleInfo( BeautifulSoup( contents ) )
        nSuccess = 0
        for u in URLs:
                nSuccess += db.insertURL( u )
        print 'inserted %d new pages.'%nSuccess

        while 1:
                uncrawled_urls = db.selectUncrawledURL()
                if not uncrawled_urls: break
                for u in uncrawled_urls: 

                        print 'downloading %s'%u
                        try:
                                parseArticle( u )
                        except:
                                traceback.print_exc()
                                db.updateURL( u, -1 )
                        db.commit()
                #bs.UpdateIndex()

I suspect you opened the database in ASCII mode, or initialized it to run as ASCII. - Steve Barnes
Never modify Python's default encoding. - user2665694
Oh, my MySQL mode is utf8. I already changed it with default-character-set = utf8. - Moon Taejin
But did you recreate crawlDB after that change? Databases are sticky! - Steve Barnes
I have now recreated crawlDB as you instructed, but I get the same error. There is one difference, though: the 'ascii' error has become 'latin-1'. Where could the problem be? I did change the database character set to utf8_unicode_ci. - Moon Taejin
3 Answers

2

Specify the charset when connecting:

self.conn = MySQLdb.connect(db='crawlDB', user='root', passwd='......', charset='utf8')

Replace the following line:
self.cursor.execute("UPDATE urls SET state=%d,content='%s' WHERE url='%s'"%(state,content,url))

with this (passing the SQL and the parameters separately):

self.cursor.execute("UPDATE urls SET state=%s, content=%s WHERE url=%s", (state,content,url))

Example session:

>>> import MySQLdb
>>> db = MySQLdb.connect('localhost', db='test', charset='utf8')
>>> cursor = db.cursor()
>>> cursor.execute('DROP TABLE IF EXISTS urls')
0L
>>> cursor.execute('CREATE TABLE urls(url char(200), state int, content text)')
0L
>>> cursor.execute('INSERT INTO urls(url, state, content) VALUES(%s, %s, %s)', ('http://daum.net/', 1, u'\uc548\ub155'))
1L
>>> cursor.execute('SELECT * FROM urls')
1L
>>> for row in cursor.fetchall():
...     print row
...
(u'http://daum.net/', 1L, u'\uc548\ub155')
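
For completeness, here is a minimal sketch of how those two changes (charset='utf8' on the connection and parameter passing instead of % formatting) might look inside the DB class from the question; the table layout is taken from the question, and the password placeholder is kept as-is:

# -*- coding: utf-8 -*-
import MySQLdb

class DB:
        "MySQL wrapper class (sketch with charset + parameterized queries)"
        def __init__(self):
                # charset='utf8' makes MySQLdb send/receive text as unicode
                self.conn = MySQLdb.connect(db='crawlDB', user='root',
                                            passwd='......', charset='utf8')
                self.cursor = self.conn.cursor()

        def insertURL(self, url, state=0, content=None):
                if url.endswith('/'): url = url[:-1]
                try:
                        # the driver quotes and escapes the values itself
                        self.cursor.execute("INSERT INTO urls VALUES (%s, %s, %s)",
                                            (url, state, content))
                except MySQLdb.Error:
                        return 0
                return 1

        def updateURL(self, url, content, state=1):
                if url.endswith('/'): url = url[:-1]
                self.cursor.execute("UPDATE urls SET state=%s, content=%s WHERE url=%s",
                                    (state, content, url))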

@MoonTaejin, please check your database status (especially the character set). Here is mine. - falsetru
falsetru, I have checked my database status and it is the same as yours. I still get the UnicodeDecodeError. - Moon Taejin
@MoonTaejin, I recorded a screencast demo. I hope it helps. - falsetru
@MoonTaejin, did you solve the problem? What was the cause? - falsetru
Oh, my problem was the single quotes. For example, "UPDATE urls SET state=%d,content='bla'...". The crawled content contained single-quote characters, since I was scraping text from web pages. Thank you. - Moon Taejin
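
That failure mode is easy to reproduce: with % formatting, a single quote inside the crawled text ends the SQL string literal early, while a parameterized call leaves the quoting to the driver. A minimal sketch (the content string and URL below are made up for illustration):

# -*- coding: utf-8 -*-
import MySQLdb

db = MySQLdb.connect(db='crawlDB', user='root', passwd='......', charset='utf8')
cursor = db.cursor()

# made-up crawled text that happens to contain a single quote
content = u"it's a test \uc548\ub155"
url = 'http://blog.daum.net/someblog/1'

# With % formatting the text is pasted straight into the SQL, so the quote
# inside `content` closes the '...' literal early and MySQL rejects it:
#   UPDATE urls SET state=1,content='it's a test ...' WHERE url='...'

# With parameters, MySQLdb escapes the quote and encodes the unicode value:
cursor.execute("UPDATE urls SET state=%s, content=%s WHERE url=%s",
               (1, content, url))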

0
Try setting the environment variable PYTHONIOENCODING to utf_8. If you don't want to export it, you can run the script like this:
PYTHONIOENCODING=utf-8 python myproject.py
Also, you must use u"" strings.
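
A minimal illustration of the u"" point, assuming a UTF-8 source file (the file name demo.py is made up); PYTHONIOENCODING tells Python 2 which encoding to use when the unicode string is printed to stdout:

# -*- coding: utf-8 -*-
# run as: PYTHONIOENCODING=utf-8 python demo.py
msg = u'자신의 글 주소들을 db에 저장합니다'  # a unicode literal, not a byte string
print msg  # encoded for stdout using PYTHONIOENCODING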

0

Since you build the MySQL commands as strings, those strings need to be Unicode strings. Try changing every cursor.execute(" line to cursor.execute(u"
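
Applied to updateURL from the question, the suggestion would look like the sketch below (this only illustrates the answer's idea; as the comment below notes, it did not resolve the error on its own):

def updateURL(self, url, content, state=1):
        if url[-1]=='/': url=url[:-1]
        # unicode format string, per the suggestion; the interpolated values
        # would also need to be unicode objects rather than raw byte strings
        self.cursor.execute(u"UPDATE urls SET state=%d,content='%s' WHERE url='%s'"
                            % (state, content, url))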


First of all, thank you very much for your answer. But I have tried your suggestion and it still does not work; I get the same error message. - Moon Taejin
