A Beginner-Level Data Mining Project with Python, Scrapy and SQLAlchemy

By | 2014/05/16

Preparation:
Set up the environment.
OS: Ubuntu 12.04 LTS
Python: 2.7.5
Scrapy: 0.22
SQLAlchemy: 0.8.5
Database: MySQL 5.5
Step 1: pick the pages we want to crawl:
1. Tencent domestic news: http://news.qq.com/china_index.shtml
2. ifeng charity news: http://gongyi.ifeng.com/rt-channel/rtlist_0/index.shtml
3. Sina entertainment news: http://ent.sina.com.cn/star/
4. Tencent international news: http://news.qq.com/world_index.shtml
5. chinanews.com military section: http://www.chinanews.com/mil/news.shtml
6. chinanews.com sports section: http://www.chinanews.com/sports.shtml
7. Budejie jokes: http://www.budejie.com/duanzi/
Step 2: design the database schema:
We create a database named spider with the following tables:
chinanewscontent - domestic news content
chinanewstitle - domestic news titles and links

communitycontent - charity news content
communitytitle - charity news titles and links

entertainmentcontent - entertainment news content
entertainmenttitle - entertainment news titles and links

internewscontent - international news content
internewstitle - international news titles and links

jokecontent - jokes

malitarycontent - military news content (table names spelled as in the code)
malitarytitle - military news titles and links

sportscontent - sports news content
sportstitle - sports news titles and links

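The spider database itself has to be created by hand before the SQLAlchemy models in step 4 can build these tables. Assuming the karlname/karlpassword account used in the example connection string later on, one way to do it from the terminal is:

mysql -u karlname -p -e "CREATE DATABASE spider DEFAULT CHARACTER SET utf8;"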
Step 3: create the Python project:
Open a terminal and run the following commands:

cd ~/Desktop
mkdir myspider
cd myspider
scrapy startproject karlspider

Now we can start working on the project. I recommend PyCharm as the editor since it is quite powerful, but Vim works just as well; the choice of editor is up to you.
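For reference, scrapy startproject karlspider generates roughly the following layout (the data_manage package and command.py are added by hand in the later steps):

karlspider/
    scrapy.cfg
    karlspider/
        __init__.py
        items.py
        pipelines.py
        settings.py
        spiders/
            __init__.py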
Step 4: write the model layer:
Following a layered design, the project is split into a controller layer and a model layer; we start with the model layer.
I use PyCharm for development here. After importing the project, create a data_manage package under karlspider/karlspider containing two files: data_settings.py and data_option.py.
The code is as follows:
data_settings.py

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String
from sqlalchemy.engine import create_engine
from sqlalchemy.dialects.mysql import LONGTEXT

'''the connection to the database'''
'''
for example:
engine = create_engine("mysql://karlname:karlpassword@localhost/spider?charset=utf8",isolation_level="READ UNCOMMITTED")
'''
engine = create_engine("mysql://<username>:<password>@<host>/<database>?charset=utf8", isolation_level="READ UNCOMMITTED")
'''the construct of the database table'''
Base = declarative_base()

class NewsTitle(object):
    id = Column(Integer, primary_key=True)
    title = Column(String(50))
    link = Column(String(500))
    lock = Column(String(1))
    
class NewsContent(object):
    id = Column(Integer,primary_key=True)
    title = Column(String(50))
    content = Column(LONGTEXT)
    
class Joke(object):
    id = Column(Integer, primary_key=True)
    content = Column(LONGTEXT)
    
class User(object):
    id = Column(Integer,primary_key=True)
    username = Column(String(500))
    password = Column(String(500))
    malitary = Column(String(500))
    chinanews = Column(String(500))
    internews = Column(String(500))
    community = Column(String(500))
    
class ChinaNewsTitle(Base,NewsTitle):
    __tablename__ = 'chinanewstitle'
    
class ChinaNewsContent(Base,NewsContent):
    __tablename__ = 'chinanewscontent'
    
class InterNewsTitle(Base,NewsTitle):
    __tablename__ = 'internewstitle'
    
class InterNewsContent(Base,NewsContent):
    __tablename__ = 'internewscontent'

class CommunityTitle(Base,NewsTitle):
    __tablename__ = 'communitytitle'
    
class CommunityContent(Base,NewsContent):
    __tablename__ = 'communitycontent'
    
class MalitaryTitle(Base,NewsTitle):
    __tablename__ = 'malitarytitle'
    
class MalitaryContent(Base,NewsContent):
    __tablename__ = 'malitarycontent'
    
class JokeContent(Base,Joke):
    __tablename__ = 'jokecontent'
    
class SportsTitle(Base,NewsTitle):
    __tablename__ = 'sportstitle'
    
class SportsContent(Base,NewsContent):
    __tablename__ = 'sportscontent'
    
class EntertainmentTitle(Base,NewsTitle):
    __tablename__ = 'entertainmenttitle'
    
class EntertainmentContent(Base,NewsContent):
    __tablename__ = 'entertainmentcontent'
    
Base.metadata.create_all(engine)
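Importing data_settings once (each spider below does this indirectly through data_option) runs create_all(engine) and builds every table. You can check the result from the mysql client:

mysql -u karlname -p spider -e "SHOW TABLES;"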

data_option.py

'''
this module contains the helpers for the database operations:
insert information into the database
read information from the database
'''

from sqlalchemy.orm.session import sessionmaker
from data_settings import ChinaNewsContent,ChinaNewsTitle,CommunityContent,CommunityTitle,\
                    InterNewsContent,InterNewsTitle,JokeContent,engine,MalitaryTitle,MalitaryContent,\
                    SportsTitle,SportsContent,EntertainmentTitle,EntertainmentContent
'''
insert the information into database
'''
def Session_Adapter_Add(data_information):
    Session = sessionmaker()
    Session.configure(bind=engine)
    session = Session()
    session.add(data_information)
    session.flush()
    session.commit()
'''
return a session
'''    
def Get_Session():
    Session = sessionmaker()
    Session.configure(bind=engine)
    session = Session()
    return session

class DataOptionSave():
    '''
    use this method to save the joke content in database
    '''
    def Joke_Save(self,contents):
        jokeContent = JokeContent()
        jokeContent.content = contents
        Session_Adapter_Add(jokeContent)
        
    def Chnews_Title_Save(self,s,t):
        li = eval(s)
        lis = eval(t)
        for i in range(len(li)):
            chNewsTitle = ChinaNewsTitle()
            chNewsTitle.link = "http://news.qq.com"+li[len(li)-1-i]
            chNewsTitle.title = lis[len(li)-1-i]
            chNewsTitle.lock = 0
            Session_Adapter_Add(chNewsTitle)
            
    def Sport_Title_Save(self,s,t):
        for i in range(len(s)):
            SportTitle = SportsTitle()
            SportTitle.link = "http://www.chinanews.com"+s[len(s)-1-i]
            SportTitle.title = t[len(s)-1-i]
            SportTitle.lock = 0
            Session_Adapter_Add(SportTitle)
        
    def Chnews_Content_Save(self,title,content):
        chNewsContent = ChinaNewsContent()
        chNewsContent.title = title
        chNewsContent.content = content
        Session_Adapter_Add(chNewsContent)
        
    def Sport_Content_Save(self,title,content):
        sportContent = SportsContent()
        sportContent.title = title
        sportContent.content = content
        Session_Adapter_Add(sportContent)
        
    def Inter_Title_Save(self,s,t):
        li = eval(s)
        lis = eval(t)
        for i in range(len(li)):
            interNewsTitle = InterNewsTitle()
            interNewsTitle.link = "http://news.qq.com"+li[len(li)-1-i]
            interNewsTitle.title = lis[len(li)-1-i]
            interNewsTitle.lock = 0
            Session_Adapter_Add(interNewsTitle)
        
    def Inter_Content_Save(self,title,content):
        interContent = InterNewsContent()
        interContent.title = title
        interContent.content = content
        Session_Adapter_Add(interContent)
        
    def Comu_Title_Save(self,s,t):
        for i in range(len(s)):
            comuNewsTitle = CommunityTitle()
            comuNewsTitle.link = s[len(s)-1-i]
            comuNewsTitle.title = t[len(s)-1-i]
            comuNewsTitle.lock = 0
            Session_Adapter_Add(comuNewsTitle)
            
    def Entertainment_Title_Save(self,s,t):
        for i in range(len(s)):
            comuNewsTitle = EntertainmentTitle()
            comuNewsTitle.link = s[len(s)-1-i]
            comuNewsTitle.title = t[len(s)-1-i]
            comuNewsTitle.lock = 0
            Session_Adapter_Add(comuNewsTitle)
        
    def Comu_Content_Save(self,title,content):
        comuContent = CommunityContent()
        comuContent.title = title
        comuContent.content = content
        Session_Adapter_Add(comuContent)
        
    def Entertainment_Content_Save(self,title,content):
        comuContent = EntertainmentContent()
        comuContent.title = title
        comuContent.content = content
        Session_Adapter_Add(comuContent)
        
    def Mali_Title_Save(self,s,t):
        for i in range(len(s)):
            malitaryTitle = MalitaryTitle()
            if "http://www.chinanews.com" in s[len(s)-1-i]:
                malitaryTitle.link = s[len(s)-1-i]
            else:
                malitaryTitle.link = "http://www.chinanews.com"+s[len(s)-1-i]
            malitaryTitle.title = t[len(s)-1-i]
            malitaryTitle.lock = 0
            Session_Adapter_Add(malitaryTitle)
            
    def Mali_Content_Save(self,title,content):
        maliContent = MalitaryContent()
        maliContent.title = title
        maliContent.content = content
        Session_Adapter_Add(maliContent)
        
class DataOptionGet():
    '''
    get the number message
    '''
    def get_title_num(self,table_name):
        session = Get_Session()
        num = 0
        if table_name == 'joke':
            missing = session.query(JokeContent).all()
            num = 10*len(missing)
        elif table_name == 'china':
            missing = session.query(ChinaNewsTitle).all()
            num = len(missing)
        elif table_name == 'internation':
            missing = session.query(InterNewsTitle).all()
            num = len(missing)
        elif table_name == 'sports':
            missing = session.query(SportsTitle).all()
            num = len(missing)
        elif table_name == 'malitary':
            missing = session.query(MalitaryTitle).all()
            num = len(missing)
        elif table_name == 'community':
            missing = session.query(CommunityTitle).all()
            num = len(missing)
        elif table_name == 'entertainment':
            missing = session.query(EntertainmentTitle).all()
            num = len(missing)
        return num
    '''
    get the message in the table
    '''
    def get_table_message(self,table_name):
        session = Get_Session()
        if table_name == 'joke':
            missing = session.query(JokeContent).all()
        elif table_name == 'china':
            missing = session.query(ChinaNewsTitle).all()
        elif table_name == 'internation':
            missing = session.query(InterNewsTitle).all()
        elif table_name == 'sport':
            missing = session.query(SportsTitle).all()
        elif table_name == 'malitary':
            missing = session.query(MalitaryTitle).all()
        elif table_name == 'community':
            missing = session.query(CommunityTitle).all()
        elif table_name == 'entertainment':
            missing = session.query(EntertainmentTitle).all()
        return missing
    '''
    get unlock links and last link from ChinaNewsTitle
    lock the link used in ChinaNewsTitle
    '''
    def get_Unlock_Links_CH(self):
        session = Get_Session()
        missing = session.query(ChinaNewsTitle).filter(ChinaNewsTitle.lock=="0").all()
        return missing
    def get_Last_Link_CH(self):
        session = Get_Session()
        try:
            missing = session.query(ChinaNewsTitle).order_by(ChinaNewsTitle.id.desc()).first()
            return missing.link
        except:
            return ''
    def lock_Links_CH(self):
        session = Get_Session()
        session.query(ChinaNewsTitle).filter(ChinaNewsTitle.lock=='0').update({ChinaNewsTitle.lock:'1'})
        session.flush()
        session.commit()
    '''
    get unlock links and last link from InterNewsTitle
    lock the link used in InterNewsTitle
    '''    
    def get_Unlock_Links_IN(self):
        session = Get_Session()
        missing = session.query(InterNewsTitle).filter(InterNewsTitle.lock=="0").all()
        return missing
    def get_Last_Link_IN(self):
        session = Get_Session()
        try:
            missing = session.query(InterNewsTitle).order_by(InterNewsTitle.id.desc()).first()
            return missing.link
        except:
            return ''
    def lock_Links_IN(self):
        session = Get_Session()
        session.query(InterNewsTitle).filter(InterNewsTitle.lock=='0').update({InterNewsTitle.lock:'1'})
        session.flush()
        session.commit()
    '''
    get unlock links and last link from CommunityTitle
    lock the link used in CommunityTitle
    '''
    def get_Unlock_Links_CO(self):
        session = Get_Session()
        missing = session.query(CommunityTitle).filter(CommunityTitle.lock=="0").all()
        return missing
    def get_Last_Link_CO(self):
        session = Get_Session()
        try:
            missing = session.query(CommunityTitle).order_by(CommunityTitle.id.desc()).first()
            return missing.link
        except:
            return ''
    def lock_Links_CO(self):
        session = Get_Session()
        session.query(CommunityTitle).filter(CommunityTitle.lock=='0').update({CommunityTitle.lock:'1'})
        session.flush()
        session.commit()
    '''
    get unlock links and last link from MalitaryTitle
    lock the link used in MalitaryTitle
    '''
    def get_Unlock_Links_MA(self):
        session = Get_Session()
        missing = session.query(MalitaryTitle).filter(MalitaryTitle.lock=="0").all()
        return missing
    def get_Last_Link_MA(self):
        session = Get_Session()
        try:
            missing = session.query(MalitaryTitle).order_by(MalitaryTitle.id.desc()).first()
            return missing.link
        except:
            return ''
    def lock_Links_MA(self):
        session = Get_Session()
        session.query(MalitaryTitle).filter(MalitaryTitle.lock=='0').update({MalitaryTitle.lock:'1'})
        session.flush()
        session.commit()
    '''
    get unlock links and last link from SportsTitle
    lock the link used in SportsTitle
    '''
    def get_Unlock_Links_Sport(self):
        session = Get_Session()
        missing = session.query(SportsTitle).filter(SportsTitle.lock=="0").all()
        return missing
    def get_Last_Link_Sport(self):
        session = Get_Session()
        try:
            missing = session.query(SportsTitle).order_by(SportsTitle.id.desc()).first()
            return missing.link
        except:
            return ''
    def lock_Links_Sport(self):
        session = Get_Session()
        session.query(SportsTitle).filter(SportsTitle.lock=='0').update({SportsTitle.lock:'1'})
        session.flush()
        session.commit()
        
    '''
    get unlock links and last link from EntertainmentTitle
    lock the links used in EntertainmentTitle
    '''
    def get_Unlock_Links_ET(self):
        session = Get_Session()
        missing = session.query(EntertainmentTitle).filter(EntertainmentTitle.lock=="0").all()
        return missing
    def get_Last_Link_ET(self):
        session = Get_Session()
        try:
            missing = session.query(EntertainmentTitle).order_by(EntertainmentTitle.id.desc()).first()
            return missing.link
        except:
            return ''
    def lock_Links_ET(self):
        session = Get_Session()
        session.query(EntertainmentTitle).filter(EntertainmentTitle.lock=='0').update({EntertainmentTitle.lock:'1'})
        session.flush()
        session.commit()        
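That is all of data_option.py. A quick way to sanity-check it is to call the helpers from a Python shell, for example (a minimal sketch, run from the project root and assuming the connection string in data_settings.py is valid):

from karlspider.data_manage.data_option import DataOptionSave, DataOptionGet

save = DataOptionSave()
save.Joke_Save("['just a sample joke for testing']")   # stores one stringified list of jokes

get = DataOptionGet()
print get.get_title_num('joke')          # counted as 10 per stored joke row
print len(get.get_Unlock_Links_CH())     # unlocked domestic news links, 0 on a fresh database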
        
        
        

Step 5: write the controller layer:
Create the following files in the karlspider/karlspider/spiders directory:
china_spider.py community_spider.py crawl_api.py entertainment_spider.py inter_spider.py joke_spider.py military_spider.py sports_spider.py
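Note that crawl_api.py below imports a DmozItem from karlspider/karlspider/items.py and fills its title, link and desc fields. That file is not listed above; a minimal definition consistent with how it is used would look like this (the class name DmozItem is simply carried over from the Scrapy tutorial):

#coding=utf-8
from scrapy.item import Item, Field

class DmozItem(Item):
    title = Field()   # list of extracted titles
    link = Field()    # list of extracted links
    desc = Field()    # list of extracted text blocks (used by the joke crawler)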

china_spider.py

#coding=utf-8
'''
Created on 2014-01-25

@author: karl
'''
import crawl_api
from karlspider.data_manage.data_option import DataOptionGet
from scrapy.spider import BaseSpider

class ChinaSpider(BaseSpider):
    name = "chinaspider.org"
    allowed_domains = ["chinaspider.org"]
    start_urls = ['http://news.qq.com/china_index.shtml']
    def parse(self, response):
        crawl_api.Crawl_China_Title(response)
        
        
class ChinaContentSpider(BaseSpider):
    name = "chinacontentspider.org"
    allowed_domains = ["chinacontentspider.org"]
    start_urls = []
    dataoptionget = DataOptionGet()
    mission = dataoptionget.get_Unlock_Links_CH()
    num = len(mission)
    for i in range(num):
        start_urls.append(mission[i].link)
    def parse(self,response):
        crawl_api.Crawl_China_Content(response)
        

community_spider.py

#coding=utf-8
'''
Created on 2014-01-25

@author: karl
'''
import crawl_api
from karlspider.data_manage.data_option import DataOptionGet
from scrapy.spider import BaseSpider

class CommunitySpider(BaseSpider):
    name = "communityspider.org"
    allowed_domains = ["communityspider.org"]
    start_urls = ['http://gongyi.ifeng.com/rt-channel/rtlist_0/index.shtml']
    def parse(self, response):
        crawl_api.Crawl_Community_Title(response)
        
        
class CommunityContentSpider(BaseSpider):
    name = "communitycontentspider.org"
    allowed_domains = ["communitycontentspider.org"]
    start_urls = []
    dataoptionget = DataOptionGet()
    mission = dataoptionget.get_Unlock_Links_CO()
    num = len(mission)
    for i in range(num):
        start_urls.append(mission[i].link)
    def parse(self,response):
        crawl_api.Crawl_Community_Content(response)
    

crawl_api.py

#coding=utf-8
'''
use this module to crawl article titles, links and contents
'''
from karlspider.items import DmozItem
from scrapy.selector import HtmlXPathSelector
from karlspider.data_manage.data_option import DataOptionSave,DataOptionGet
'''
crawl the content's link and content's title from tencent chinanews 
save the content's link and title in database
'''
def Crawl_China_Title(response):
    _title = []
    _link = []
    item = DmozItem()
    hxs = HtmlXPathSelector(response)
    sites = hxs.select('//a')
    dataOptionGet = DataOptionGet()
    latest_link = dataOptionGet.get_Last_Link_CH()
    for site in sites:
        item['title'] = site.select("//a[@target='_blank' and @class='linkto']/text()").extract()
        item['link'] = site.select("//a[@target='_blank' and @class='linkto']/@href").extract()
        break
    for i in range(len(item['title'])):
        if "http://news.qq.com"+str(item['link'][i]) == latest_link:
            break
        _title.append(item['title'][i])
        _link.append(item['link'][i])
    if len(_link) > 0:
        dataOptionSave = DataOptionSave()
        dataOptionSave.Chnews_Title_Save(str(_link), str(_title))

'''
crawl the content's link and content's title from tencent internationalnews 
save the content's link and title in database
'''
def Crawl_Inter_Title(response):
    _title = []
    _link = []
    item = DmozItem()
    hxs = HtmlXPathSelector(response)
    sites = hxs.select('//a')
    dataOptionGet = DataOptionGet()
    latest_link = dataOptionGet.get_Last_Link_IN()
    for site in sites:
        item['title'] = site.select("//a[@target='_blank' and @class='linkto']/text()").extract()
        item['link'] = site.select("//a[@target='_blank' and @class='linkto']/@href").extract()
        break
    for i in range(len(item['title'])):
        if "http://news.qq.com"+str(item['link'][i]) == latest_link:
            break
        _title.append(item['title'][i])
        _link.append(item['link'][i])
    if len(_link) > 0:
        dataOptionSave = DataOptionSave()
        dataOptionSave.Inter_Title_Save(str(_link), str(_title))

'''
this method crawls the jokes from budejie

first select all the <p> tags
then keep only the <p> tags whose class is 'web_size'

save the content in a list
convert the list into string

save the string in database

'''
def Crawl_Joke_Content(response):
    dataOptionSave = DataOptionSave()
    jokecontent = []
    item = DmozItem()
    hxs = HtmlXPathSelector(response)
    sites = hxs.select('//p')
    for site in sites:
        item['desc'] = site.select("//p[@class='web_size']").extract()
    for i in range(len(item['desc'])):
        jokecontent.append(item['desc'][i])
    jokecontent = str(jokecontent)
    dataOptionSave.Joke_Save(jokecontent)
    jokecontent = eval(jokecontent)

'''
use this method to crawl information from tencent
first select all tag <div>
then select the <div> whose id is 'Cnt-Main-Article-QQ'

save the information in a list
convert the list into a string
save the string in database
'''
def Crawl_China_Content(response):
    newscontent = []
    hxs = HtmlXPathSelector(response)
    title = hxs.select("//h1/text()")[0].extract()
    sites = hxs.select("//div")
    divs = sites.select("//div[@id=\"Cnt-Main-Article-QQ\"]")
    for p in divs[0].select(".//p"): # extracts all <p> inside
        newscontent.append(p.extract())
    newscontent = str(newscontent)
    dataOptionSave = DataOptionSave()
    dataOptionSave.Chnews_Content_Save(title, newscontent)

'''
use this method to crawl information from tencent
first select all tag <div>
then select the <div> whose id is 'Cnt-Main-Article-QQ'

save the information in a list
convert the list into a string
save the string in database
'''
def Crawl_Inter_Content(response):
    newscontent = []
    hxs = HtmlXPathSelector(response)
    title = hxs.select("//h1/text()")[0].extract()
    sites = hxs.select("//div")
    divs = sites.select("//div[@id=\"Cnt-Main-Article-QQ\"]")
    for p in divs[0].select(".//p"): # extracts all <p> inside
        newscontent.append(p.extract())
    newscontent = str(newscontent)
    dataOptionSave = DataOptionSave()
    dataOptionSave.Inter_Content_Save(title, newscontent)

'''
crawl the titles and links from fenghuang community
save the titles and links in database
'''
def Crawl_Community_Title(response):
    _title = []
    _link = []
    item = DmozItem()
    hxs = HtmlXPathSelector(response)
    item['title'] = hxs.select("//div[@class=\"newsList\"]//ul//li//a/text()").extract()
    item['link'] = hxs.select("//div[@class=\"newsList\"]//ul//li//a/@href").extract()
    dataOptionGet = DataOptionGet()
    latest_link = dataOptionGet.get_Last_Link_CO()
    for i in range(len(item['title'])):
        if str(item['link'][i]) == latest_link:
            break
        _title.append(item['title'][i])
        _link.append(item['link'][i])
    if len(_link) > 0:
        dataOptionSave = DataOptionSave()
        dataOptionSave.Comu_Title_Save(_link,_title) 

'''
crawl the community content from fenghuang community

first select all the <div> tags
then select the <div> whose id is 'main_content'
'''
def Crawl_Community_Content(response):
    newscontent = []
    hxs = HtmlXPathSelector(response)
    title = hxs.select("//h1/text()")[0].extract()
    sites = hxs.select("//div")
    divs = sites.select("//div[@id=\"main_content\" and @class=\"js_selection_area\"]")
    try:
        for p in divs[0].select(".//p"): # extracts all <p> inside
            newscontent.append(p.extract())
        newscontent = str(newscontent)
        dataOptionSave = DataOptionSave()
        dataOptionSave.Comu_Content_Save(title, newscontent)
    except:
        pass
    
'''
crawl the military news titles and links from chinanews.com
save the titles and links in database
'''
def Crawl_Malitary_Title(response):
    _title = []
    _link = []
    item = DmozItem()
    hxs = HtmlXPathSelector(response)
    item['title'] = hxs.select("//div[@class='dd_bt']//a/text()").extract()
    item['link'] = hxs.select("//div[@class='dd_bt']//a/@href").extract()
    dataOptionGet = DataOptionGet()
    latest_link = dataOptionGet.get_Last_Link_MA()
    for i in range(len(item['title'])):
        if "http://www.chinanews.com"+str(item['link'][i]) == latest_link:
            break
        _title.append(item['title'][i])
        _link.append(item['link'][i])
    if len(_link) > 0:
        dataOptionSave = DataOptionSave()
        dataOptionSave.Mali_Title_Save(_link,_title) 

'''
crawl the military news contents from chinanews.com
select all <div> tags
select the <div> whose class is 'left_zw'
select all tag <p> from <div>
save the content in a list
convert the list into string 
save the string in database
'''        
def Crawl_Malitary_Content(response):
    newscontent = []
    hxs = HtmlXPathSelector(response)
    title = hxs.select("//title/text()")[0].extract()
    title = title.split('-')[0]
    sites = hxs.select("//div")
    divs = sites.select("//div[@class=\"left_zw\"]")
    try:
        for p in divs[0].select(".//p"): # extracts all <p> inside
            newscontent.append(p.extract())
        newscontent = str(newscontent)
        dataOptionSave = DataOptionSave()
        dataOptionSave.Mali_Content_Save(title, newscontent)
    except:
        pass
'''
crawl the Sports News's titles and links from http://www.chinanews.com
'''
def Crawl_Sport_Title(response):
    _title = []
    _link = []
    item = DmozItem()
    hxs = HtmlXPathSelector(response)
    item['title'] = hxs.select("//div[@class='dd_bt']//a/text()").extract()
    item['link'] = hxs.select("//div[@class='dd_bt']//a/@href").extract()
    dataOptionGet = DataOptionGet()
    latest_link = dataOptionGet.get_Last_Link_Sport()
    for i in range(len(item['title'])):
        if "http://www.chinanews.com"+str(item['link'][i]) == latest_link:
            break
        _title.append(item['title'][i])
        _link.append(item['link'][i])
    if len(_link) > 0:
        dataOptionSave = DataOptionSave()
        dataOptionSave.Sport_Title_Save(_link,_title)    

'''
crawl the Sports News from http://www.chinanews.com
'''
def Crawl_Sport_Content(response):
    newscontent = []
    hxs = HtmlXPathSelector(response)
    title = hxs.select("//title/text()")[0].extract()
    title = title.split('-')[0]
    sites = hxs.select("//div")
    divs = sites.select("//div[@class=\"left_zw\"]")
    for p in divs[0].select(".//p"): # extracts all <p> inside
        newscontent.append(p.extract())
    newscontent = str(newscontent)
    dataOptionSave = DataOptionSave()
    dataOptionSave.Sport_Content_Save(title, newscontent)
    
'''
Crawl Entertainment from sina
'''    
def Crawl_Entertainment_Title(response):
    _title = []
    _link = []
    item = DmozItem()
    hxs = HtmlXPathSelector(response)
    item['title'] = hxs.select("//div[@class=\"news-item  img-news-item\"]//h2//a/text()").extract()
    item['link'] = hxs.select("//div[@class=\"news-item  img-news-item\"]//h2//a/@href").extract()
    dataOptionGet = DataOptionGet()
    latest_link = dataOptionGet.get_Last_Link_ET()
    for i in range(len(item['title'])):
        if str(item['link'][i]) == latest_link:
            break
        _title.append(item['title'][i])
        _link.append(item['link'][i])
    if len(_link) > 0:
        dataOptionSave = DataOptionSave()
        dataOptionSave.Entertainment_Title_Save(_link,_title) 
    
'''
crawl the entertainment news contents from sina
'''
def Crawl_Entertainment_Content(response):
    newscontent = []
    hxs = HtmlXPathSelector(response)
    title = hxs.select("//title/text()")[0].extract()
    title = title.split('|')[0]
    sites = hxs.select("//div")
    try:
        divs = sites.select("//div[@class=\"blkContainerSblkCon BSHARE_POP clearfix\" and @id=\"artibody\"]")
        for p in divs[0].select(".//p"): # extracts all <p> inside
            newscontent.append(p.extract())
        newscontent = str(newscontent)
        dataOptionSave = DataOptionSave()
        dataOptionSave.Entertainment_Content_Save(title, newscontent)
    except:
        pass

entertainment_spider.py

#coding=utf-8
'''
Created on 2014-01-25

@author: karl
'''
import crawl_api
from karlspider.data_manage.data_option import DataOptionGet
from scrapy.spider import BaseSpider

class EntertainmentSpider(BaseSpider):
    name = "entertainmentspider.org"
    allowed_domains = ["entertainmentspider.org"]
    start_urls = ['http://ent.sina.com.cn/star/']
    def parse(self, response):
        crawl_api.Crawl_Entertainment_Title(response)
        
        
class EntertainmentContentSpider(BaseSpider):
    name = "entertainmentcontentspider.org"
    allowed_domains = ["entertainmentcontentspider.org"]
    start_urls = []
    dataoptionget = DataOptionGet()
    mission = dataoptionget.get_Unlock_Links_ET()
    num = len(mission)
    for i in range(num):
        start_urls.append(mission[i].link)
    def parse(self,response):
        crawl_api.Crawl_Entertainment_Content(response)

inter_spider.py

#coding=utf-8
'''
Created on 2014-01-25

@author: karl
'''
import crawl_api
from karlspider.data_manage.data_option import DataOptionGet
from scrapy.spider import BaseSpider

class InterSpider(BaseSpider):
    name = "interspider.org"
    allowed_domains = ["interspider.org"]
    start_urls = ['http://news.qq.com/world_index.shtml']
    def parse(self, response):
        crawl_api.Crawl_Inter_Title(response)
        
        
class InterContentSpider(BaseSpider):
    name = "intercontentspider.org"
    allowed_domains = ["intercontentspider.org"]
    start_urls = []
    dataoptionget = DataOptionGet()
    mission = dataoptionget.get_Unlock_Links_IN()
    num = len(mission)
    for i in range(num):
        start_urls.append(mission[i].link)
    def parse(self,response):
        crawl_api.Crawl_Inter_Content(response)
    

joke_spider.py

#coding=utf-8
'''
Created on 2014-01-25

@author: karl
'''
import crawl_api
from scrapy.spider import BaseSpider

class JokeSpider(BaseSpider):
    name = "jokespider.org"
    allowed_domains = ["jokespider.org"]
    start_urls = ['http://www.budejie.com/duanzi/']
#     for i in range(2000):
#         start_urls.append('http://www.budejie.com/xcs.php?page='+str(i)+'&maxid=1381126801')
    def parse(self, response):
        crawl_api.Crawl_Joke_Content(response)

military_spider.py

#coding=utf-8
'''
Created on 2014-01-25

@author: karl
'''
import crawl_api
from karlspider.data_manage.data_option import DataOptionGet
from scrapy.spider import BaseSpider

class MaliSpider(BaseSpider):
    name = "malispider.org"
    allowed_domains = ["malispider.org"]
    start_urls = ['http://www.chinanews.com/mil/news.shtml']
    def parse(self, response):
        crawl_api.Crawl_Malitary_Title(response)
        
        
class MaliContentSpider(BaseSpider):
    name = "malicontentspider.org"
    allowed_domains = ["malicontentspider.org"]
    start_urls = []
    dataoptionget = DataOptionGet()
    mission = dataoptionget.get_Unlock_Links_MA()
    num = len(mission)
    for i in range(num):
        start_urls.append(mission[i].link)
    def parse(self,response):
        crawl_api.Crawl_Malitary_Content(response)

sports_spider.py

#coding=utf-8
'''
Created on 2014-01-25

@author: karl
'''
import crawl_api
from karlspider.data_manage.data_option import DataOptionGet
from scrapy.spider import BaseSpider

class SportSpider(BaseSpider):
    name = "sportspider.org"
    allowed_domains = ["sportspider.org"]
    start_urls = ['http://www.chinanews.com/sports.shtml']
    def parse(self, response):
        crawl_api.Crawl_Sport_Title(response)
        
        
class SportContentSpider(BaseSpider):
    name = "sportcontentspider.org"
    allowed_domains = ["sportcontentspider.org"]
    start_urls = []
    dataoptionget = DataOptionGet()
    mission = dataoptionget.get_Unlock_Links_Sport()
    num = len(mission)
    for i in range(num):
        start_urls.append(mission[i].link)
    def parse(self,response):
        crawl_api.Crawl_Sport_Content(response)
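sports_spider.py is the last of the spiders. Each title/content pair can already be run by hand from the project root (the directory containing scrapy.cfg), for example:

scrapy crawl chinaspider.org
scrapy crawl chinacontentspider.org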

The project is basically finished at this point. Let's add one more convenience script: create a command.py under karlspider/karlspider.

#coding=utf-8
'''
Created on 2014-01-27

@author: karl
'''
import os
from data_manage.data_option import DataOptionGet

path = os.getcwd()
def joke():
    # os.system blocks until the crawl finishes, so each step sees the
    # results of the previous one before locking links
    os.system('cd ' + path + ' && scrapy crawl jokespider.org')
def china():
    os.system('cd ' + path + ' && scrapy crawl chinaspider.org')
    os.system('cd ' + path + ' && scrapy crawl chinacontentspider.org')
    dataoptionget = DataOptionGet()
    dataoptionget.lock_Links_CH()
def internation():
    os.system('cd ' + path + ' && scrapy crawl interspider.org')
    os.system('cd ' + path + ' && scrapy crawl intercontentspider.org')
    dataoptionget = DataOptionGet()
    dataoptionget.lock_Links_IN()
def sport():
    os.system('cd ' + path + ' && scrapy crawl sportspider.org')
    os.system('cd ' + path + ' && scrapy crawl sportcontentspider.org')
    dataoptionget = DataOptionGet()
    dataoptionget.lock_Links_Sport()
def malitary():
    os.system('cd ' + path + ' && scrapy crawl malispider.org')
    os.system('cd ' + path + ' && scrapy crawl malicontentspider.org')
    dataoptionget = DataOptionGet()
    dataoptionget.lock_Links_MA()
def community():
    os.system('cd ' + path + ' && scrapy crawl communityspider.org')
    os.system('cd ' + path + ' && scrapy crawl communitycontentspider.org')
    dataoptionget = DataOptionGet()
    dataoptionget.lock_Links_CO()
def entertainment():
    os.system('cd ' + path + ' && scrapy crawl entertainmentspider.org')
    os.system('cd ' + path + ' && scrapy crawl entertainmentcontentspider.org')
    dataoptionget = DataOptionGet()
    dataoptionget.lock_Links_ET()

def terminal():
    dataoptionget = DataOptionGet()
    print 'data mining start...'
    print 'joke china internation sports'
    print 'malitary community entertainment'
    while True:
        command = raw_input('input command:').strip().split(' ')
        if command[0] == 'exit':
            break
        if not command[0] == 'runcrawler' and not command[0] == 'show' and not command[0] == 'shownum':
            print 'command error'
        elif command[0] == 'runcrawler':
            if command[1] == 'all':
                joke()
                china()
                internation()
                sport()
                malitary()
                community()
                entertainment()
            else:
                for i in range(len(command)):
                    if command[i] == 'joke':
                        joke()
                    if command[i] == 'china':
                        china()
                    if command[i] == 'internation':
                        internation()
                    if command[i] == 'sports':
                        sport()
                    if command[i] == 'malitary':
                        malitary()
                    if command[i] == 'community':
                        community()
                    if command[i] == 'entertainment':
                        entertainment()
        elif command[0] == 'show':
            for i in range(len(command)):
                if command[i] == 'china':
                    mission = dataoptionget.get_table_message('china')
                    for j in range(len(mission)):
                        print str(j)+'.'+mission[j].title
                if command[i] == 'internation':
                    mission = dataoptionget.get_table_message('internation')
                    for j in range(len(mission)):
                        print str(j)+'.'+mission[j].title
                if command[i] == 'sports':
                    mission = dataoptionget.get_table_message('sport')
                    for j in range(len(mission)):
                        print str(j)+'.'+mission[j].title
                if command[i] == 'malitary':
                    mission = dataoptionget.get_table_message('malitary')
                    for j in range(len(mission)):
                        print str(j)+'.'+mission[j].title
                if command[i] == 'community':
                    mission = dataoptionget.get_table_message('community')
                    for j in range(len(mission)):
                        print str(j)+'.'+mission[j].title
                if command[i] == 'entertainment':
                    mission = dataoptionget.get_table_message('entertainment')
                    for j in range(len(mission)):
                        print str(j)+'.'+mission[j].title
        elif command[0] == 'shownum':
            if command[1] == 'all':
                print 'joke:'+str(dataoptionget.get_title_num('joke'))
                print 'china:'+str(dataoptionget.get_title_num('china'))
                print 'internation:'+str(dataoptionget.get_title_num('internation'))
                print 'sports:'+str(dataoptionget.get_title_num('sports'))
                print 'malitary:'+str(dataoptionget.get_title_num('malitary'))
                print 'community:'+str(dataoptionget.get_title_num('community'))
                print 'entertainment:'+str(dataoptionget.get_title_num('entertainment'))
            else:
                for i in range(len(command)):
                    if command[i] == 'joke':
                        print 'joke:'+str(dataoptionget.get_title_num('joke'))
                    if command[i] == 'china':
                        print 'china:'+str(dataoptionget.get_title_num('china'))
                    if command[i] == 'internation':
                        print 'internation:'+str(dataoptionget.get_title_num('internation'))
                    if command[i] == 'sports':
                        print 'sports:'+str(dataoptionget.get_title_num('sports'))
                    if command[i] == 'malitary':
                        print 'malitary:'+str(dataoptionget.get_title_num('malitary'))
                    if command[i] == 'community':
                        print 'community:'+str(dataoptionget.get_title_num('community'))
                    if command[i] == 'entertainment':
                        print 'entertainment:'+str(dataoptionget.get_title_num('entertainment'))
        
if __name__ == '__main__':
    terminal()
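Running command.py starts a small interactive console. A session might look like this (the counts are only illustrative, and the scrapy log output is omitted):

python command.py
data mining start...
joke china internation sports
malitary community entertainment
input command:runcrawler china sports
input command:shownum china sports
china:120
sports:86
input command:exit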
    

That's it, the crawler is done. Scrapy and SQLAlchemy are both fairly heavyweight frameworks, and here we used an incremental crawling approach: only links newer than the last saved one are fetched, and links that have already been crawled are locked. In a later blog post I will build another project to publish the information we have crawled.
