zhihu.author 源代码

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json

from .base import BaseZhihu
from .common import *


class BanException(Exception):
    """当尝试获取被反屏蔽系统限制的用户资料时，将会引发此异常"""
    pass


[文档]class Author(BaseZhihu):
    """用户类，请使用``ZhihuClient.answer``方法构造对象."""

    @class_common_init(re_author_url, True)
[文档]    def __init__(self, url, name=None, motto=None, follower_num=None,
                 question_num=None, answer_num=None, upvote_num=None,
                 thank_num=None, photo_url=None, session=None):
        """创建用户类实例.

        :param str url: 用户主页url，形如 http://www.zhihu.com/people/7sdream
        :param str name: 用户名字，可选
        :param str motto: 用户简介，可选
        :param int follower_num: 用户粉丝数，可选
        :param int question_num: 用户提问数，可选
        :param int answer_num: 用户答案数，可选
        :param int upvote_num: 用户获得赞同数，可选
        :param int thank_num: 用户获得感谢数，可选
        :param str photo_url: 用户头像地址，可选
        :param Session session: 使用的网络会话，为空则使用新会话。
        :return: 用户对象
        :rtype: Author
        """
        self.url = url
        self._session = session
        self.card = None
        self._nav_list = None
        self._name = name
        self._motto = motto
        self._follower_num = follower_num
        self._question_num = question_num
        self._answer_num = answer_num
        self._upvote_num = upvote_num
        self._thank_num = thank_num
        self._photo_url = photo_url

    def _gen_soup(self, content):
        self.soup = BeautifulSoup(content)
        ban_title = self.soup.find("div", class_="ProfileBan-title")
        if ban_title is not None:
            raise BanException(ban_title.text)
        self._nav_list = self.soup.find(
            'div', class_='profile-navbar').find_all('a')

    def _make_card(self):
        if self.card is None and self.url is not None:
            params = {'url_token': self.id}
            real_params = {'params': json.dumps(params)}
            r = self._session.get(Get_Profile_Card_URL, params=real_params)
            self.card = BeautifulSoup(r.content)

    @property
    def id(self):
        """获取用户id，就是网址最后那一部分.

        :return: 用户id
        :rtype: str
        """
        return re.match(r'^.*/([^/]+)/$', self.url).group(1) \
            if self.url is not None else ''

    @property
    @check_soup('_xsrf')
    def xsrf(self):
        """获取知乎的反xsrf参数（用不到就忽视吧~）

        :return: xsrf参数
        :rtype: str
        """
        return self.soup.find('input', attrs={'name': '_xsrf'})['value']

    @property
    @check_soup('_hash_id')
    def hash_id(self):
        """获取作者的内部hash id（用不到就忽视吧~）

        :return: 用户hash id
        :rtype: str
        """
        div = self.soup.find('div', class_='zm-profile-header-op-btns')
        if div is not None:
            return div.button['data-id']
        else:
            ga = self.soup.find('script', attrs={'data-name': 'ga_vars'})
            return json.loads(ga.text)['user_hash']

    @property
    @check_soup('_name', '_make_card')
    def name(self):
        """获取用户名字.

        :return: 用户名字
        :rtype: str
        """
        if self.url is None:
            return '匿名用户'
        if self.soup is not None:
            return self.soup.find('div', class_='title-section').span.text
        else:
            assert self.card is not None
            return self.card.find('span', class_='name').text

    @property
    @check_soup('_motto', '_make_card')
    def motto(self):
        """获取用户自我介绍，由于历史原因，我还是把这个属性叫做motto吧.

        :return: 用户自我介绍
        :rtype: str
        """
        if self.url is None:
            return ''
        else:
            if self.soup is not None:
                bar = self.soup.find(
                    'div', class_='title-section')
                if len(bar.contents) < 4:
                    return ''
                else:
                    return bar.contents[3].text
            else:
                assert self.card is not None
                motto = self.card.find('div', class_='tagline')
                return motto.text if motto is not None else ''

    @property
    @check_soup('_photo_url', '_make_card')
    def photo_url(self):
        """获取用户头像图片地址.

        :return: 用户头像url
        :rtype: str
        """
        if self.url is not None:
            if self.soup is not None:
                img = self.soup.find('img', class_='Avatar Avatar--l')['src']
                return img.replace('_l', '_r')
            else:
                assert (self.card is not None)
                return PROTOCOL + self.card.img['src'].replace('_xs', '_r')
        else:
            return 'http://pic1.zhimg.com/da8e974dc_r.jpg'

    @property
    @check_soup('_followee_num')
    def followee_num(self):
        """获取关注了多少人.

        :return: 关注的人数
        :rtype: int
        """
        if self.url is None:
            return 0
        else:
            number = int(self.soup.find(
                'div', class_='zm-profile-side-following').a.strong.text)
            return number

    @property
    @check_soup('_follower_num')
    def follower_num(self):
        """获取追随者数量，就是关注此人的人数.

        :return: 追随者数量
        :rtype: int
        """
        if self.url is None:
            return 0
        else:
            number = int(self.soup.find(
                'div', class_='zm-profile-side-following zg-clear').find_all(
                'a')[1].strong.text)
            return number

    @property
    @check_soup('_upvote_num')
    def upvote_num(self):
        """获取收到的的赞同数量.

        :return: 收到的的赞同数量
        :rtype: int
        """
        if self.url is None:
            return 0
        else:
            number = int(self.soup.find(
                'span', class_='zm-profile-header-user-agree').strong.text)
            return number

    @property
    @check_soup('_thank_num')
    def thank_num(self):
        """获取收到的感谢数量.

        :return: 收到的感谢数量
        :rtype: int
        """
        if self.url is None:
            return 0
        else:
            number = int(self.soup.find(
                'span', class_='zm-profile-header-user-thanks').strong.text)
            return number

    @property
    @check_soup('_weibo_url')
    def weibo_url(self):
        """获取用户微博链接.

        :return: 微博链接地址，如没有则返回 ‘unknown’
        :rtype: str
        """
        if self.url is None:
            return None
        else:
            tmp = self.soup.find(
                'a', class_='zm-profile-header-user-weibo')
            return tmp['href'] if tmp is not None else 'unknown'

    @property
    def business(self):
        """用户的行业.

        :return: 用户的行业，如没有则返回 ‘unknown’
        :rtype: str
        """
        return self._find_user_profile('business')

    @property
    def location(self):
        """用户的所在地.

        :return: 用户的所在地，如没有则返回 ‘unknown’
        :rtype: str
        """
        return self._find_user_profile('location')

    @property
    def education(self):
        """用户的教育状况.

        :return: 用户的教育状况，如没有则返回 ‘unknown’
        :rtype: str
        """
        return self._find_user_profile('education')

    def _find_user_profile(self, t):
        self._make_soup()
        if self.url is None:
            return 'unknown'
        else:
            res = self.soup.find(
                'span', class_=t)
            if res and res.has_attr('title'):
                return res['title']
            else:
                return 'unknown'

    @property
    @check_soup('_gender')
    def gender(self):
        """用户的性别.

        :return: 用户的性别（male/female/unknown)
        :rtype: str
        """
        if self.url is None:
            return 'unknown'
        else:
            return 'female' \
                if self.soup.find('i', class_='icon-profile-female') \
                else 'male'

    @property
    @check_soup('_question_num')
    def question_num(self):
        """获取提问数量.

        :return: 提问数量
        :rtype: int
        """
        if self.url is None:
            return 0
        else:
            return int(self._nav_list[1].span.text)

    @property
    @check_soup('_answer_num')
    def answer_num(self):
        """获取答案数量.

        :return: 答案数量
        :rtype: int
        """
        if self.url is None:
            return 0
        else:
            return int(self._nav_list[2].span.text)

    @property
    @check_soup('_post_num')
    def post_num(self):
        """获取专栏文章数量.

        :return: 专栏文章数量
        :rtype: int
        """
        if self.url is None:
            return 0
        else:
            return int(self._nav_list[3].span.text)

    @property
    @check_soup('_collection_num')
    def collection_num(self):
        """获取收藏夹数量.

        :return: 收藏夹数量
        :rtype: int
        """
        if self.url is None:
            return 0
        else:
            return int(self._nav_list[4].span.text)

    @property
    @check_soup('_followed_column_num')
    def followed_column_num(self):
        """获取用户关注的专栏数

        :return: 关注的专栏数
        :rtype: int
        """
        if self.url is not None:
            tag = self.soup.find('div', class_='zm-profile-side-columns')
            if tag is not None:
                return int(re_get_number.match(
                    tag.parent.strong.text).group(1))
        return 0

    @property
    @check_soup('_followed_topic_num')
    def followed_topic_num(self):
        """获取用户关注的话题数

        :return: 关注的话题数
        :rtype: int
        """
        if self.url is not None:
            tag = self.soup.find('div', class_='zm-profile-side-topics')
            if tag is not None:
                return int(re_get_number.match(
                    tag.parent.strong.text).group(1))
        return 0

    @property
    def questions(self):
        """获取用户的所有问题.

        :return: 用户的所有问题，返回生成器.
        :rtype: Question.Iterable
        """
        from .question import Question

        if self.url is None or self.question_num == 0:
            return
        for page_index in range(1, (self.question_num - 1) // 20 + 2):
            html = self._session.get(
                self.url + 'asks?page=' + str(page_index)).text
            soup = BeautifulSoup(html)
            question_links = soup.find_all('a', class_='question_link')
            question_datas = soup.find_all(
                'div', class_='zm-profile-section-main')
            for link, data in zip(question_links, question_datas):
                url = Zhihu_URL + link['href']
                title = link.text.strip()
                answer_num = int(
                    re_get_number.match(data.div.contents[4]).group(1))
                follower_num = int(
                    re_get_number.match(data.div.contents[6]).group(1))
                q = Question(url, title, follower_num, answer_num,
                             session=self._session)
                yield q

    @property
    def answers(self):
        """获取用户的所有答案.

        :return: 用户所有答案，返回生成器.
        :rtype: Answer.Iterable
        """
        from .question import Question
        from .answer import Answer

        if self.url is None or self.answer_num == 0:
            return
        for page_index in range(1, (self.answer_num - 1) // 20 + 2):
            html = self._session.get(
                self.url + 'answers?page=' + str(page_index)).text
            soup = BeautifulSoup(html)
            questions = soup.find_all('a', class_='question_link')
            upvotes = soup.find_all('a', class_='zm-item-vote-count')
            for q, upvote in zip(questions, upvotes):
                answer_url = Zhihu_URL + q['href']
                question_url = Zhihu_URL + re_a2q.match(q['href']).group(1)
                question_title = q.text
                upvote_num = upvote.text
                if upvote_num.isdigit():
                    upvote_num = int(upvote_num)
                else:
                    upvote_num = None
                question = Question(question_url, question_title,
                                    session=self._session)
                yield Answer(answer_url, question, self, upvote_num,
                             session=self._session)

    @property
    def followers(self):
        """获取关注此用户的人.

        :return: 关注此用户的人，返回生成器
        :rtype: Author.Iterable
        """
        for x in self._follow_ee_ers('er'):
            yield x

    @property
    def followees(self):
        """获取用户关注的人.

        :return: 用户关注的人的，返回生成器
        :rtype: Author.Iterable
        """
        for x in self._follow_ee_ers('ee'):
            yield x

[文档]    def followers_skip(self, skip):
        """获取关注此用户的人，跳过前 skip 个用户。

        :return: 关注此用户的人，返回生成器
        :rtype: Author.Iterable
        """
        for x in self._follow_ee_ers('er', skip):
            yield x

[文档]    def followees_skip(self, skip):
        """获取用户关注的人，跳过前 skip 个用户。

        :return: 用户关注的人的，返回生成器
        :rtype: Author.Iterable
        """
        for x in self._follow_ee_ers('ee', skip):
            yield x

    def _follow_ee_ers(self, t, skip=0):
        if self.url is None:
            return
        if t == 'er':
            request_url = Author_Get_More_Followers_URL
        else:
            request_url = Author_Get_More_Followees_URL
        self._make_card()
        if self.hash_id is None:
            self._make_soup()
        headers = dict(Default_Header)
        headers['Referer'] = self.url + 'follow' + t + 's'
        params = {"order_by": "created", "offset": 0, "hash_id": self.hash_id}
        data = {'_xsrf': self.xsrf, 'method': 'next', 'params': ''}
        gotten_date_num = 20
        offset = skip
        while gotten_date_num == 20:
            params['offset'] = offset
            data['params'] = json.dumps(params)
            res = self._session.post(request_url, data=data, headers=headers)
            json_data = res.json()
            gotten_date_num = len(json_data['msg'])
            offset += gotten_date_num
            for html in json_data['msg']:
                soup = BeautifulSoup(html)
                h2 = soup.find('h2')
                author_name = h2.a.text
                author_url = h2.a['href']
                author_motto = soup.find('span', class_='bio').text
                author_photo = PROTOCOL + soup.a.img['src'].replace('_m', '_r')
                numbers = [
                    int(re_get_number.match(x.text).group(1))
                    for x in soup.find_all('a', class_="zg-link-gray-normal")
                ]
                try:
                    yield Author(author_url, author_name, author_motto,
                                 *numbers,
                                 photo_url=author_photo, session=self._session)
                except ValueError:  # invalid url
                    yield ANONYMOUS

    @property
    def collections(self):
        """获取用户收藏夹.

        :return: 用户收藏夹，返回生成器
        :rtype: Collection.Iterable
        """
        from .collection import Collection

        if self.url is None or self.collection_num == 0:
            return
        else:
            collection_num = self.collection_num
            for page_index in range(1, (collection_num - 1) // 20 + 2):
                html = self._session.get(
                    self.url + 'collections?page=' + str(page_index)).text
                soup = BeautifulSoup(html)
                collections_names = soup.find_all(
                    'a', class_='zm-profile-fav-item-title')
                collection_follower_nums = soup.find_all(
                    'div', class_='zm-profile-fav-bio')
                for c, f in zip(collections_names, collection_follower_nums):
                    c_url = Zhihu_URL + c['href']
                    c_name = c.text
                    c_fn = int(re_get_number.match(f.contents[2]).group(1))
                    yield Collection(c_url, self, c_name, c_fn,
                                     session=self._session)

    @property
    def columns(self):
        """获取用户专栏.

        :return: 用户专栏，返回生成器
        :rtype: Column.Iterable
        """
        from .column import Column

        if self.url is None or self.post_num == 0:
            return
        soup = BeautifulSoup(self._session.get(self.url + 'posts').text)
        column_list = soup.find('div', class_='column-list')
        column_tags = column_list.find_all('div', class_='item')
        for column_tag in column_tags:
            name = column_tag['title']
            url = column_tag['data-href']
            numbers = column_tag.find('span', class_='des').text.split('•')
            follower_num = int(re_get_number.match(numbers[0]).group(1))
            if len(numbers) == 1:
                post_num = 0
            else:
                post_num = int(
                    re_get_number.match(numbers[1]).group(1))
            yield Column(url, name, follower_num, post_num,
                         session=self._session)

    @property
    def followed_columns(self):
        """获取用户关注的专栏.

        :return: 用户关注的专栏，返回生成器
        :rtype: Column.Iterable
        """
        from .column import Column
        if self.url is None:
            return
        if self.followed_column_num > 0:
            tag = self.soup.find('div', class_='zm-profile-side-columns')
            if tag is not None:
                for a in tag.find_all('a'):
                    yield Column(a['href'], a.img['alt'],
                                 session=self._session)
            if self.followed_column_num > 7:
                offset = 7
                gotten_data_num = 20
                while gotten_data_num == 20:
                    params = {
                        'hash_id': self.hash_id,
                        'limit': 20,
                        'offset': offset
                    }
                    data = {
                        'method': 'next',
                        '_xsrf': self.xsrf,
                        'params': json.dumps(params)
                    }
                    j = self._session.post(Author_Get_More_Follow_Column_URL,
                                           data=data).json()
                    gotten_data_num = len(j['msg'])
                    offset += gotten_data_num
                    for msg in map(BeautifulSoup, j['msg']):
                        name = msg.strong.text
                        url = msg.a['href']
                        post_num = int(re_get_number.match(
                            msg.span.text).group(1))
                        yield Column(url, name, post_num=post_num,
                                     session=self._session)

    @property
    def followed_topics(self):
        """获取用户关注的话题.

        :return: 用户关注的话题，返回生成器
        :rtype: Topic.Iterable
        """
        from .topic import Topic
        if self.url is None:
            return
        if self.followed_topic_num > 0:
            tag = self.soup.find('div', class_='zm-profile-side-topics')
            if tag is not None:
                for a in tag.find_all('a'):
                    yield Topic(Zhihu_URL + a['href'], a.img['alt'],
                                session=self._session)
            if self.followed_topic_num > 7:
                offset = 7
                gotten_data_num = 20
                while gotten_data_num == 20:
                    data = {'start': 0, 'offset': offset, '_xsrf': self.xsrf}
                    j = self._session.post(
                        Author_Get_More_Follow_Topic_URL.format(self.id),
                        data=data).json()
                    gotten_data_num = j['msg'][0]
                    offset += gotten_data_num
                    topic_item = BeautifulSoup(j['msg'][1]).find_all(
                        'div', class_='zm-profile-section-item')
                    for div in topic_item:
                        name = div.strong.text
                        url = Zhihu_URL + div.a['href']
                        yield Topic(url, name, session=self._session)

    @property
    def activities(self):
        """获取用户的最近动态.

        :return: 最近动态，返回生成器，具体说明见 :class:`.Activity`
        :rtype: Activity.Iterable
        """
        from .activity import Activity

        if self.url is None:
            return

        gotten_feed_num = 20
        start = '0'
        api_url = self.url + 'activities'
        while gotten_feed_num == 20:
            data = {'_xsrf': self.xsrf, 'start': start}
            res = self._session.post(api_url, data=data)
            gotten_feed_num = res.json()['msg'][0]
            soup = BeautifulSoup(res.json()['msg'][1])
            acts = soup.find_all(
                'div', class_='zm-profile-section-item zm-item clearfix')

            start = acts[-1]['data-time'] if len(acts) > 0 else 0
            for act in acts:
                # --- ignore Round Table temporarily ---
                if act.attrs['data-type-detail'] == "member_follow_roundtable":
                    continue
                # --- --- --- --- -- --- --- --- --- ---
                yield Activity(act, self._session, self)

    @property
    def last_activity_time(self):
        """获取用户最后一次活动的时间

        :return: 用户最后一次活动的时间，返回值为 unix 时间戳
        :rtype: int
        """
        self._make_soup()
        act = self.soup.find(
            'div', class_='zm-profile-section-item zm-item clearfix')
        return int(act['data-time']) if act is not None else -1

[文档]    def is_zero_user(self):
        """返回当前用户是否为三零用户，其实是四零： 赞同0，感谢0，提问0，回答0.

        :return: 是否是三零用户
        :rtype: bool
        """
        return self.upvote_num + self.thank_num + \
               self.question_num + self.answer_num == 0


class _Anonymous:
    def __init__(self):
        self.name = "匿名用户"
        self.url = ''


ANONYMOUS = _Anonymous()
"""匿名用户常量，通过 ``zhihu.ANONYMOUS`` 访问。

提问者、回答者、点赞者、问题关注者、评论者都可能是 ``ANONYMOUS``
"""